From 031a7995f38d3c73b0790280cc0fa1fe25d33bff Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Thu, 31 Oct 2024 19:09:46 -0600 Subject: [PATCH 01/43] [Bugfix][Frontend] Reject guided decoding in multistep mode (#9892) Signed-off-by: Joe Runde --- docs/source/serving/compatibility_matrix.rst | 2 +- .../openai/test_prompt_validation.py | 20 +++++++++++++++++++ vllm/engine/llm_engine.py | 7 +++++++ vllm/sampling_params.py | 4 ++-- 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index 20a81f4cad1d1..cab19e4ec5b6c 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -283,7 +283,7 @@ Feature x Feature - ✅ - ✅ - ✅ - - `✗ `__ + - `✗ `__ - ? - ✅ - ✅ diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 58075f7023821..1ae64ef492d5b 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -35,3 +35,23 @@ async def test_out_of_vocab_token_ids(): prompt=[999999], max_tokens=5, temperature=0.0) + + +@pytest.mark.asyncio +async def test_reject_multistep_with_guided_decoding(): + model_name = "gpt2" + server_args = ["--enforce-eager", "--num-scheduler-steps", "8"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + with pytest.raises(openai.BadRequestError, + match=re.compile( + '.*Guided decoding .* multi-step decoding.*')): + await client.completions.create( + model=model_name, + prompt="Hello", + max_tokens=5, + temperature=0.0, + extra_body={"response_format": { + "type": "json_object" + }}) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3fd34fadee1ca..edef1f30a9e91 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -829,6 +829,13 @@ def add_request( raise ValueError(f"Got priority {priority} but " "Priority scheduling is not enabled.") + if isinstance(params, SamplingParams) \ + and (params.guided_decoding or params.logits_processors) \ + and self.scheduler_config.num_scheduler_steps > 1: + raise ValueError( + "Guided decoding and logits processors are not supported " + "in multi-step decoding") + if arrival_time is None: arrival_time = time.time() diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 5e191c6e715e0..5c6df5aaf5446 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -485,8 +485,8 @@ def __repr__(self) -> str: f"skip_special_tokens={self.skip_special_tokens}, " "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " - f"truncate_prompt_tokens={self.truncate_prompt_tokens}), " - f"guided_decoding={self.guided_decoding}") + f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " + f"guided_decoding={self.guided_decoding})") class BeamSearchParams( From 96e0c9cbbd65ad0b8ad20611b90bcc86a8559aae Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 31 Oct 2024 21:56:09 -0700 Subject: [PATCH 02/43] [torch.compile] directly register custom op (#9896) Signed-off-by: youkaichao --- tests/compile/piecewise/test_simple.py | 20 ++++-- tests/compile/piecewise/test_toy_llama.py | 20 ++++-- vllm/attention/backends/flash_attn.py | 16 +++-- vllm/attention/backends/flashinfer.py | 17 +++-- vllm/distributed/parallel_state.py | 34 +++++++--- .../layers/fused_moe/fused_marlin_moe.py | 25 +++++-- .../layers/fused_moe/fused_moe.py | 68 
+++++++++++-------- vllm/utils.py | 45 ++++++++++++ vllm/v1/attention/backends/flash_attn.py | 14 ++-- 9 files changed, 192 insertions(+), 67 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index a34d33efba1d8..d151d62516b07 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -6,18 +6,22 @@ import torch from torch import nn +from torch.library import Library from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.compilation.levels import CompilationLevel +from vllm.utils import direct_register_custom_op os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) global_counter = 0 +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + -@torch.library.custom_op("silly::attention", mutates_args=["out"]) def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor) -> None: global global_counter @@ -27,12 +31,20 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out[0] += 1 -@silly_attention.register_fake -def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: return +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + @support_torch_compile class SillyModel(nn.Module): diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index db6a983d70feb..e3e5a7d0fc5a5 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -8,6 +8,7 @@ import torch from torch import nn +from torch.library import Library from vllm.compilation.compile_context import set_compile_context from vllm.compilation.config import CompilationConfig @@ -15,9 +16,12 @@ from vllm.compilation.decorators import support_torch_compile from vllm.compilation.levels import CompilationLevel from vllm.plugins import set_compilation_config +from vllm.utils import direct_register_custom_op + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa -@torch.library.custom_op("silly::attention", mutates_args=["out"]) def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor) -> None: out.copy_(q) @@ -25,12 +29,20 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out += v -@silly_attention.register_fake -def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: return +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + @dataclass class LlamaConfig: hidden_size: int = 128 diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index ffa05e80623ac..c294fcf7f08fe 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -14,7 +14,8 @@ compute_slot_mapping_start_idx, is_block_tables_empty) from vllm.forward_context import get_forward_context -from vllm.utils import 
async_tensor_h2d, make_tensor_with_pad +from vllm.utils import (async_tensor_h2d, direct_register_custom_op, + make_tensor_with_pad) if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, @@ -595,8 +596,6 @@ def forward( return output -@torch.library.custom_op("vllm::unified_flash_attention", - mutates_args=["kv_cache"]) def unified_flash_attention( query: torch.Tensor, key: torch.Tensor, @@ -755,8 +754,7 @@ def unified_flash_attention( return output.view(num_tokens, hidden_size) -@unified_flash_attention.register_fake -def _( +def unified_flash_attention_fake( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -773,3 +771,11 @@ def _( logits_soft_cap: Optional[float] = None, ) -> torch.Tensor: return torch.empty_like(query) + + +direct_register_custom_op( + op_name="unified_flash_attention", + op_func=unified_flash_attention, + mutates_args=["kv_cache"], + fake_impl=unified_flash_attention_fake, +) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 5ea101ae0432f..234c87d5c4edb 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -28,8 +28,8 @@ is_block_tables_empty) from vllm.attention.ops.paged_attn import PagedAttention from vllm.forward_context import get_forward_context -from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, - make_tensor_with_pad) +from vllm.utils import (async_tensor_h2d, direct_register_custom_op, + get_kv_cache_torch_dtype, make_tensor_with_pad) if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, @@ -785,8 +785,6 @@ def forward( ) -@torch.library.custom_op("vllm::unified_flash_infer", - mutates_args=["kv_cache"]) def unified_flash_infer( query: torch.Tensor, key: torch.Tensor, @@ -906,8 +904,7 @@ def unified_flash_infer( return output.view(num_tokens, hidden_size) -@unified_flash_infer.register_fake -def _( +def unified_flash_infer_fake( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -924,3 +921,11 @@ def _( logits_soft_cap: Optional[float] = None, ) -> torch.Tensor: return torch.empty_like(query).contiguous() + + +direct_register_custom_op( + op_name="unified_flash_infer", + op_func=unified_flash_infer, + mutates_args=["kv_cache"], + fake_impl=unified_flash_infer_fake, +) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b04bbc478534c..94ba41a016f6d 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -37,7 +37,7 @@ import vllm.envs as envs from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import supports_custom_op +from vllm.utils import direct_register_custom_op, supports_custom_op @dataclass @@ -99,8 +99,6 @@ def _register_group(group: "GroupCoordinator") -> None: if supports_custom_op(): - @torch.library.custom_op("vllm::inplace_all_reduce", - mutates_args=["tensor"]) def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None: assert group_name in _groups, f"Group {group_name} is not found." 
group = _groups[group_name]() @@ -108,11 +106,16 @@ def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None: raise ValueError(f"Group {group_name} is destroyed.") group._all_reduce_in_place(tensor) - @inplace_all_reduce.register_fake - def _(tensor: torch.Tensor, group_name: str) -> None: + def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None: return - @torch.library.custom_op("vllm::outplace_all_reduce", mutates_args=[]) + direct_register_custom_op( + op_name="inplace_all_reduce", + op_func=inplace_all_reduce, + mutates_args=["tensor"], + fake_impl=inplace_all_reduce_fake, + ) + def outplace_all_reduce(tensor: torch.Tensor, group_name: str) -> torch.Tensor: assert group_name in _groups, f"Group {group_name} is not found." @@ -121,10 +124,17 @@ def outplace_all_reduce(tensor: torch.Tensor, raise ValueError(f"Group {group_name} is destroyed.") return group._all_reduce_out_place(tensor) - @outplace_all_reduce.register_fake - def _(tensor: torch.Tensor, group_name: str) -> torch.Tensor: + def outplace_all_reduce_fake(tensor: torch.Tensor, + group_name: str) -> torch.Tensor: return torch.empty_like(tensor) + direct_register_custom_op( + op_name="outplace_all_reduce", + op_func=outplace_all_reduce, + mutates_args=[], + fake_impl=outplace_all_reduce_fake, + ) + class GroupCoordinator: """ @@ -338,6 +348,11 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: if self.world_size == 1: return input_ + if input_.is_cpu: + import intel_extension_for_pytorch as ipex + ipex.distributed.all_reduce(input_, group=self.device_group) + return input_ + if not supports_custom_op(): self._all_reduce_in_place(input_) return input_ @@ -369,9 +384,6 @@ def _all_reduce_in_place(self, input_: torch.Tensor) -> None: pynccl_comm = self.pynccl_comm if (pynccl_comm is not None and not pynccl_comm.disabled): pynccl_comm.all_reduce(input_) - elif input_.is_cpu: - import intel_extension_for_pytorch as ipex - ipex.distributed.all_reduce(input_, group=self.device_group) else: torch.distributed.all_reduce(input_, group=self.device_group) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 93019d0d0abb6..4741d69de11ac 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -8,6 +8,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, moe_align_block_size, try_get_optimal_moe_config) from vllm.scalar_type import scalar_types +from vllm.utils import direct_register_custom_op def get_scalar_type(num_bits: int, has_zp: bool): @@ -18,7 +19,6 @@ def get_scalar_type(num_bits: int, has_zp: bool): return scalar_types.uint4b8 if num_bits == 4 else scalar_types.uint8b128 -@torch.library.custom_op("vllm::single_marlin_moe", mutates_args=[]) def single_marlin_moe( hidden_states: torch.Tensor, w: torch.Tensor, @@ -119,8 +119,7 @@ def single_marlin_moe( return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) -@single_marlin_moe.register_fake -def _( +def single_marlin_moe_fake( hidden_states: torch.Tensor, w: torch.Tensor, scales: torch.Tensor, @@ -136,7 +135,14 @@ def _( return torch.empty_like(hidden_states) -@torch.library.custom_op("vllm::fused_marlin_moe", mutates_args=[]) +direct_register_custom_op( + op_name="single_marlin_moe", + op_func=single_marlin_moe, + mutates_args=[], + fake_impl=single_marlin_moe_fake, +) + + def fused_marlin_moe( hidden_states: torch.Tensor, w1: torch.Tensor, 
@@ -324,8 +330,7 @@ def fused_marlin_moe( dim=1) -@fused_marlin_moe.register_fake -def _( +def fused_marlin_moe_fake( hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -344,3 +349,11 @@ def _( is_k_full: bool = True, ) -> torch.Tensor: return torch.empty_like(hidden_states) + + +direct_register_custom_op( + op_name="fused_marlin_moe", + op_func=fused_marlin_moe, + mutates_args=[], + fake_impl=fused_marlin_moe_fake, +) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 1cf5c2253ca0b..340da32263c1c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -12,6 +12,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op logger = init_logger(__name__) @@ -466,8 +467,6 @@ def get_config_dtype_str(dtype: torch.dtype, return None -@torch.library.custom_op("vllm::inplace_fused_experts", - mutates_args=["hidden_states"]) def inplace_fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -484,22 +483,29 @@ def inplace_fused_experts(hidden_states: torch.Tensor, a1_scale, a2_scale) -@inplace_fused_experts.register_fake -def _(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - use_fp8_w8a8: bool = False, - use_int8_w8a16: bool = False, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None) -> None: +def inplace_fused_experts_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None) -> None: pass -@torch.library.custom_op("vllm::outplace_fused_experts", mutates_args=[]) +direct_register_custom_op( + op_name="inplace_fused_experts", + op_func=inplace_fused_experts, + mutates_args=["hidden_states"], + fake_impl=inplace_fused_experts_fake, +) + + def outplace_fused_experts( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -517,21 +523,29 @@ def outplace_fused_experts( w2_scale, a1_scale, a2_scale) -@outplace_fused_experts.register_fake -def _(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - use_fp8_w8a8: bool = False, - use_int8_w8a16: bool = False, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor: +def outplace_fused_experts_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor: return torch.empty_like(hidden_states) +direct_register_custom_op( + op_name="outplace_fused_experts", + op_func=outplace_fused_experts, + mutates_args=[], + fake_impl=outplace_fused_experts_fake, +) + + def 
fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, diff --git a/vllm/utils.py b/vllm/utils.py index 03cdbe6a0dc7b..5488719cc99b0 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -32,6 +32,7 @@ import torch.types import yaml from packaging.version import Version +from torch.library import Library from typing_extensions import ParamSpec, TypeIs, assert_never import vllm.envs as envs @@ -1512,3 +1513,47 @@ def weak_ref_tensors( if isinstance(tensors, tuple): return tuple(weak_ref_tensor(t) for t in tensors) raise ValueError("Invalid type for tensors") + + +def is_in_doc_build() -> bool: + try: + from sphinx.ext.autodoc.mock import _MockModule + return isinstance(torch, _MockModule) + except ModuleNotFoundError: + return False + + +# create a library to hold the custom op +vllm_lib = Library("vllm", "FRAGMENT") # noqa + + +def direct_register_custom_op( + op_name: str, + op_func: Callable, + mutates_args: List[str], + fake_impl: Optional[Callable] = None, + target_lib: Optional[Library] = None, +): + """ + `torch.library.custom_op` can have significant overhead because it + needs to consider complicated dispatching logic. This function + directly registers a custom op and dispatches it to the CUDA backend. + See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5 + for more details. + + By default, the custom op is registered to the vLLM library. If you + want to register it to a different library, you can pass the library + object to the `target_lib` argument. + + IMPORTANT: the lifetime of the operator is tied to the lifetime of the + library object. If you want to bind the operator to a different library, + make sure the library object is alive when the operator is used. + """ + if is_in_doc_build(): + return + schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args) + my_lib = target_lib or vllm_lib + my_lib.define(op_name + schema_str) + my_lib.impl(op_name, op_func, "CUDA") + if fake_impl is not None: + my_lib._register_fake(op_name, fake_impl) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ec07464e6a12a..b2af89ebf854a 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -7,6 +7,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) from vllm.forward_context import get_forward_context +from vllm.utils import direct_register_custom_op from vllm.vllm_flash_attn import flash_attn_varlen_func @@ -152,8 +153,6 @@ def forward( return output -@torch.library.custom_op("vllm::unified_flash_attention", - mutates_args=["kv_cache"]) def unified_flash_attention( query: torch.Tensor, key: torch.Tensor, @@ -217,8 +216,7 @@ def unified_flash_attention( return output.view(num_tokens, hidden_size) -@unified_flash_attention.register_fake -def _( +def unified_flash_attention_fake( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -235,3 +233,11 @@ def _( logits_soft_cap: Optional[float] = None, ) -> torch.Tensor: return torch.empty_like(query) + + +direct_register_custom_op( + op_name="unified_flash_attention", + op_func=unified_flash_attention, + mutates_args=["kv_cache"], + fake_impl=unified_flash_attention_fake, +) From 37a4947dcd68c602d0911920e2c1a9168dea1ecb Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 1 Nov 2024 01:12:44 -0400 Subject: [PATCH 03/43] [Bugfix] Fix layer skip logic with bitsandbytes (#9887) Signed-off-by: mgoin --- 
vllm/model_executor/layers/quantization/bitsandbytes.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 7a039a78f09b8..718967a065192 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -119,7 +119,12 @@ def get_scaled_act_names(self) -> List[str]: def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]): - return any(module_name in prefix for module_name in llm_int8_skip_modules) + # Split the prefix into its dot-separated components + components = prefix.split('.') + + # Check if any of the skip modules exactly matches any component + return any(module_name in components + for module_name in llm_int8_skip_modules) class BitsAndBytesLinearMethod(LinearMethodBase): From 566cd277979bc1a46b7e99657112416af9874a58 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 31 Oct 2024 22:20:17 -0700 Subject: [PATCH 04/43] [torch.compile] rework test plans (#9866) Signed-off-by: youkaichao --- tests/compile/test_basic_correctness.py | 113 +++++++++++++++++---- tests/utils.py | 124 +++++++++++++++++++++++- vllm/model_executor/models/llava.py | 10 +- vllm/model_executor/models/phi3v.py | 10 +- 4 files changed, 226 insertions(+), 31 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 2f92ff73845f5..833589ba5dc9f 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,3 +1,4 @@ +import dataclasses from typing import Dict, List, Optional import pytest @@ -8,33 +9,109 @@ from ..utils import compare_all_settings +@dataclasses.dataclass +class TestSetting: + model: str + model_args: List[str] + pp_size: int + tp_size: int + attn_backend: str + method: str + fullgraph: bool + + +# representative settings for testing +test_settings = [ + # basic llama model + TestSetting( + model="meta-llama/Llama-3.2-1B", + model_args=[], + pp_size=2, + tp_size=2, + attn_backend="FLASHINFER", + method="generate", + fullgraph=True, + ), + # llama model with quantization + TestSetting( + model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + model_args=["--quantization", "gptq"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # MoE model + TestSetting( + model="ibm/PowerMoE-3b", + model_args=[], + pp_size=1, + tp_size=2, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # embedding model + TestSetting( + model="BAAI/bge-multilingual-gemma2", + model_args=["--task", "embedding"], + pp_size=1, + tp_size=1, + attn_backend="FLASHINFER", + method="encode", + fullgraph=True, + ), + # vision language model + TestSetting( + model="microsoft/Phi-3.5-vision-instruct", + model_args=["--trust-remote-code", "--max-model-len", "2048"], + pp_size=2, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate_with_image", + fullgraph=False, + ), +] + + # we cannot afford testing the full Catesian product # of all models and all levels -@pytest.mark.parametrize( - "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", - [ - ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASHINFER", "generate", True), - ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", - ["--quantization", "compressed-tensors" - ], 1, 1, "FLASH_ATTN", "generate", True), - ("ibm/PowerMoE-3b", [], 1, 2, "FLASH_ATTN", "generate", 
True), - # TODO: add multi-modality test for llava - ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False) - ]) -def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend, - method, fullgraph): +@pytest.mark.parametrize("test_setting", test_settings) +def test_compile_correctness(test_setting: TestSetting): # this test is run under multiple suits, with different GPUs. # make sure we only run the test with correct CUDA devices. # don't use "<", as it will duplicate the tests. + model = test_setting.model + model_args = test_setting.model_args + pp_size = test_setting.pp_size + tp_size = test_setting.tp_size + attn_backend = test_setting.attn_backend + method = test_setting.method + fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: pytest.skip("Not correct CUDA devices for the test.") import os os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend - all_args = [["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + - ["-tp", str(tp_size)]] * 3 - # don't test VLLM_TORCH_COMPILE_LEVEL == 3 case - # inductor will change the output, so we cannot compare them. + final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ + ["-tp", str(tp_size)] + all_envs: List[Optional[Dict[str, str]]] = [] + + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.PIECEWISE, + ]: + all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)}) + + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. + compare_all_settings( + model, [final_args] * 2, + all_envs, + method=method if method != "generate" else "generate_close") + all_envs.clear() + for level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS, @@ -46,4 +123,4 @@ def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend, all_envs[-1][ "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore - compare_all_settings(model, all_args, all_envs, method=method) + compare_all_settings(model, [final_args] * 3, all_envs, method=method) diff --git a/tests/utils.py b/tests/utils.py index e8aad9cb3268f..16e21f68c7c96 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,4 +1,5 @@ import asyncio +import copy import functools import os import signal @@ -8,13 +9,14 @@ import warnings from contextlib import contextmanager from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Type, Union +from typing import Any, Callable, Dict, List, Optional, Type, Union import openai import pytest import requests +import torch from openai.types.completion import Completion -from typing_extensions import ParamSpec, assert_never +from typing_extensions import ParamSpec import vllm.envs as envs from tests.models.utils import TextTextLogprobs @@ -272,6 +274,31 @@ def _test_completion( return results +def _test_completion_close( + client: openai.OpenAI, + model: str, + prompt: str, +): + results = [] + + # test with text prompt + completion = client.completions.create(model=model, + prompt=prompt, + max_tokens=1, + logprobs=5, + temperature=0.0) + + logporbs = completion.choices[0].logprobs.top_logprobs[0] + logporbs = {k: round(v, 2) for k, v in logporbs.items()} + + results.append({ + "test": "completion_close", + "logprobs": logporbs, + }) + + return results + + def _test_embeddings( client: openai.OpenAI, model: str, @@ -295,13 +322,81 @@ def _test_embeddings( return results +def _test_image_text( + client: openai.OpenAI, + model_name: str, + image_url: 
str, +): + results = [] + + # test pure text input + messages = [{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "How do you feel today?" + }, + ], + }] + + chat_completion = client.chat.completions.create(model=model_name, + messages=messages, + temperature=0.0, + max_tokens=1, + logprobs=True, + top_logprobs=5) + top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs + + for x in top_logprobs: + x.logprob = round(x.logprob, 2) + + results.append({ + "test": "pure_text", + "logprobs": top_logprobs, + }) + + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + + chat_completion = client.chat.completions.create(model=model_name, + messages=messages, + temperature=0.0, + max_tokens=1, + logprobs=True, + top_logprobs=5) + top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs + + results.append({ + "test": "text_image", + "logprobs": top_logprobs, + }) + + return results + + def compare_two_settings(model: str, arg1: List[str], arg2: List[str], env1: Optional[Dict[str, str]] = None, env2: Optional[Dict[str, str]] = None, *, - method: Literal["generate", "encode"] = "generate", + method: str = "generate", max_wait_seconds: Optional[float] = None) -> None: """ Launch API server with two different sets of arguments/environments @@ -328,7 +423,7 @@ def compare_all_settings(model: str, all_args: List[List[str]], all_envs: List[Optional[Dict[str, str]]], *, - method: Literal["generate", "encode"] = "generate", + method: str = "generate", max_wait_seconds: Optional[float] = None) -> None: """ Launch API server with several different sets of arguments/environments @@ -397,10 +492,17 @@ def compare_all_settings(model: str, if method == "generate": results += _test_completion(client, model, prompt, token_ids) + elif method == "generate_close": + results += _test_completion_close(client, model, prompt) + elif method == "generate_with_image": + results += _test_image_text( + client, model, + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png" + ) elif method == "encode": results += _test_embeddings(client, model, prompt) else: - assert_never(method) + raise ValueError(f"Unknown method: {method}") if i > 0: # if any setting fails, raise an error early @@ -410,6 +512,18 @@ def compare_all_settings(model: str, compare_envs = all_envs[i] for ref_result, compare_result in zip(ref_results, compare_results): + ref_result = copy.deepcopy(ref_result) + compare_result = copy.deepcopy(compare_result) + if "embedding" in ref_result and method == "encode": + ref_embedding = torch.tensor(ref_result["embedding"]) + compare_embedding = torch.tensor( + compare_result["embedding"]) + mse = ((ref_embedding - compare_embedding)**2).mean() + assert mse < 1e-6, ( + f"Embedding for {model=} are not the same.\n" + f"mse={mse}\n") + del ref_result["embedding"] + del compare_result["embedding"] assert ref_result == compare_result, ( f"Results for {model=} are not the same.\n" f"{ref_args=} {ref_envs=}\n" diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index eda99c029881f..27055e7ced865 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -493,13 +493,9 @@ def forward( :class:`LlavaImageInputs` """ if intermediate_tensors is not None: - input_ids = None inputs_embeds = None else: - # always pass the input via `inputs_embeds` - # to make 
sure the computation graph is consistent image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is not None: vision_embeddings = self._process_image_input(image_input) inputs_embeds = self.language_model.model.get_input_embeddings( @@ -511,7 +507,11 @@ def forward( else: inputs_embeds = self.language_model.model.get_input_embeddings( input_ids) - input_ids = None + + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 0fc4556831fd7..4928e447d5b9e 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -679,7 +679,6 @@ def forward(self, intermediate_tensors: Optional[IntermediateTensors] = None, **kwargs: object): if intermediate_tensors is not None: - input_ids = None inputs_embeds = None else: image_input = self._parse_and_validate_image_input(**kwargs) @@ -690,9 +689,14 @@ def forward(self, inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, vision_embeddings, self.image_token_id) - input_ids = None else: - inputs_embeds = None + inputs_embeds = self.language_model.model.embed_tokens( + input_ids) + + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None hidden_states = self.language_model.model(input_ids, positions, From 93a76dd21dcec8977f1ffd0e21faa88fb515b9e4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 1 Nov 2024 01:31:56 -0400 Subject: [PATCH 05/43] [Model] Support bitsandbytes for MiniCPMV (#9891) Signed-off-by: mgoin --- vllm/model_executor/models/minicpmv.py | 43 ++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index a270282d87bc8..4917c33136069 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -810,6 +810,28 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): # resampler "kv_proj", ] + + # BitandBytes specific attributes + default_bitsandbytes_target_modules = [ + ".gate_proj.", + ".down_proj.", + ".up_proj.", + ".q_proj.", + ".k_proj.", + ".v_proj.", + ".o_proj.", + ] + # in TP, these weights are partitioned along the column dimension (dim=-1) + column_parallel_weights_modules = [".down_proj.", ".o_proj."] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + embedding_modules = {} embedding_padding_modules = [] @@ -931,6 +953,27 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): "kv_proj", ] + # BitandBytes specific attributes + default_bitsandbytes_target_modules = [ + ".gate_proj.", + ".down_proj.", + ".up_proj.", + ".q_proj.", + ".k_proj.", + ".v_proj.", + ".o_proj.", + ] + # in TP, these weights are partitioned along the column dimension (dim=-1) + column_parallel_weights_modules = [".down_proj.", ".o_proj."] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + embedding_modules = {} embedding_padding_modules = 
[] From 2b5bf20988edaab21621b78a9eb589edc93f2763 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Fri, 1 Nov 2024 15:25:47 +0800 Subject: [PATCH 06/43] [torch.compile] Adding torch compile annotations to some models (#9876) Signed-off-by: youkaichao Co-authored-by: youkaichao --- docs/source/models/supported_models.rst | 2 +- tests/distributed/test_pipeline_parallel.py | 2 +- vllm/model_executor/models/falcon.py | 2 ++ vllm/model_executor/models/phi.py | 2 ++ vllm/model_executor/models/qwen.py | 2 ++ vllm/model_executor/models/qwen2.py | 2 ++ vllm/model_executor/models/qwen2_moe.py | 2 ++ 7 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 3279e7a108232..e493cebf1e9f4 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -281,7 +281,7 @@ Text Generation - ✅︎ * - :code:`Qwen2ForCausalLM` - Qwen2 - - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. + - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. - ✅︎ - ✅︎ * - :code:`Qwen2MoeForCausalLM` diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index ed6360f9d6148..1489a60891761 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -166,7 +166,7 @@ def iter_params(self, model_name: str): "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "adept/persimmon-8b-chat": PPTestSettings.fast(), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), - "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(), + "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(), "bigcode/starcoder2-3b": PPTestSettings.fast(), diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 467a33505ee12..36c85e37783ab 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -27,6 +27,7 @@ from transformers import FalconConfig as HF_FalconConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -329,6 +330,7 @@ def forward( return output +@support_torch_compile class FalconModel(nn.Module): def __init__( diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index ec20cb249ba9b..497eae4e8905b 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -42,6 +42,7 @@ from transformers import PhiConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn @@ -193,6 +194,7 @@ def forward( return hidden_states +@support_torch_compile class PhiModel(nn.Module): def __init__(self, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 998016ea28c26..61665768eacf5 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -20,6 +20,7 @@ from transformers import PretrainedConfig 
from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, @@ -549,6 +550,7 @@ def forward( return hidden_states, residual +@support_torch_compile class QWenModel(nn.Module): def __init__( diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index db1029345a8ac..db7556b3b5f4b 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -29,6 +29,7 @@ from transformers import Qwen2Config from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul @@ -237,6 +238,7 @@ def forward( return hidden_states, residual +@support_torch_compile class Qwen2Model(nn.Module): def __init__( diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index d4475b7ca27af..dac85e35d369d 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -30,6 +30,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, @@ -312,6 +313,7 @@ def forward( return hidden_states, residual +@support_torch_compile class Qwen2MoeModel(nn.Module): def __init__( From d3aa2a8b2f93f50ed40fe7d8617701a2294a13e4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 1 Nov 2024 15:34:49 +0800 Subject: [PATCH 07/43] [Doc] Update multi-input support (#9906) --- docs/source/models/supported_models.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index e493cebf1e9f4..80714a90df5c2 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -466,7 +466,7 @@ Text Generation - ✅︎ * - :code:`LlavaOnevisionForConditionalGeneration` - LLaVA-Onevision - - T + I\ :sup:`+` + V + - T + I\ :sup:`+` + V\ :sup:`+` - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - ✅︎ @@ -478,7 +478,7 @@ Text Generation - ✅︎ * - :code:`MllamaForConditionalGeneration` - Llama 3.2 - - T + I + - T + I\ :sup:`+` - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. 
- - From 06386a64dd706cf3fdab82510124ca2c2f9eee9d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 1 Nov 2024 16:13:35 +0800 Subject: [PATCH 08/43] [Frontend] Chat-based Embeddings API (#9759) --- docs/requirements-docs.txt | 2 + docs/source/conf.py | 2 +- docs/source/dev/pooling_params.rst | 5 + docs/source/getting_started/quickstart.rst | 8 +- docs/source/index.rst | 1 + docs/source/models/vlm.rst | 54 ++++- .../serving/openai_compatible_server.md | 55 ++++- tests/entrypoints/openai/test_basic.py | 13 +- tests/entrypoints/openai/test_embedding.py | 137 +++++++---- tests/entrypoints/openai/test_metrics.py | 14 +- tests/entrypoints/openai/test_tokenization.py | 32 +-- .../openai/test_vision_embedding.py | 94 ++++++++ vllm/entrypoints/openai/api_server.py | 96 +++++--- vllm/entrypoints/openai/protocol.py | 87 ++++++- vllm/entrypoints/openai/run_batch.py | 34 ++- vllm/entrypoints/openai/serving_chat.py | 222 +++++++----------- vllm/entrypoints/openai/serving_completion.py | 75 +++--- vllm/entrypoints/openai/serving_embedding.py | 87 ++++--- vllm/entrypoints/openai/serving_engine.py | 159 ++++++++++++- .../openai/serving_tokenization.py | 87 +++---- vllm/pooling_params.py | 4 +- 21 files changed, 853 insertions(+), 415 deletions(-) create mode 100644 docs/source/dev/pooling_params.rst create mode 100644 tests/entrypoints/openai/test_vision_embedding.py diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index d58f226136918..e3e35844405ac 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -13,5 +13,7 @@ torch py-cpuinfo transformers mistral_common >= 1.3.4 +aiohttp +starlette openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 8435129e752e1..c7b638473a931 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -96,7 +96,6 @@ def setup(app): # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ - "aiohttp", "compressed_tensors", "cpuinfo", "cv2", @@ -143,6 +142,7 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: "python": ("https://docs.python.org/3", None), "typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None), + "aiohttp": ("https://docs.aiohttp.org/en/stable", None), "pillow": ("https://pillow.readthedocs.io/en/stable", None), "numpy": ("https://numpy.org/doc/stable", None), "torch": ("https://pytorch.org/docs/stable", None), diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.rst new file mode 100644 index 0000000000000..334e0287aff09 --- /dev/null +++ b/docs/source/dev/pooling_params.rst @@ -0,0 +1,5 @@ +Pooling Parameters +================== + +.. autoclass:: vllm.PoolingParams + :members: diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index f0e6cddf09ef7..00b762ccc2ccb 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -138,10 +138,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep A more detailed client example can be found `here `__. 
-OpenAI Chat API with vLLM -~~~~~~~~~~~~~~~~~~~~~~~~~~ +OpenAI Chat Completions API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -vLLM is designed to also support the OpenAI Chat API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. +vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. You can use the `create chat completion `_ endpoint to interact with the model: @@ -157,7 +157,7 @@ You can use the `create chat completion `_ API, + Since OpenAI Vision API is based on `Chat Completions API `_, a chat template is **required** to launch the API server. Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it. @@ -243,6 +243,10 @@ To consume the server, you can use the OpenAI client like in the example below: A full code example can be found in `examples/openai_api_client_for_multimodal.py `_. +.. tip:: + There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. + In fact, you can place image placeholders in the middle of the text by interleaving text and image content. + .. note:: By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: @@ -251,5 +255,49 @@ A full code example can be found in `examples/openai_api_client_for_multimodal.p $ export VLLM_IMAGE_FETCH_TIMEOUT= -.. note:: - There is no need to format the prompt in the API request since it will be handled by the server. +Chat Embeddings API +^^^^^^^^^^^^^^^^^^^ + +vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_, +where a list of ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. + +.. tip:: + The schema of ``messages`` is exactly the same as in Chat Completions API. + +In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. + +.. code-block:: bash + + vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \ + --trust-remote-code --max-model-len 4096 + +.. important:: + + Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` + to run this model in embedding mode instead of text generation mode. + +Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: + +.. 
code-block:: python + + import requests + + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + print("Embedding output:", response_json["data"][0]["embedding"]) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index a1f93a9a28578..0b5f75caf2475 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -26,13 +26,26 @@ print(completion.choices[0].message) ``` ## API Reference -Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except: -- Chat: `tools`, and `tool_choice`. -- Completions: `suffix`. -vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst). +We currently support the following OpenAI APIs: + +- [Completions API](https://platform.openai.com/docs/api-reference/completions) + - *Note: `suffix` parameter is not supported.* +- [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) + - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst). + - *Note: `image_url.detail` parameter is not supported.* + - We also support `audio_url` content type for audio files. + - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. + - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).* + - *Note: `parallel_tool_calls` and `user` parameters are ignored.* +- [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) + - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API), + which will be treated as a single prompt to the model according to its chat template. + - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst). + - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* ## Extra Parameters + vLLM supports a set of parameters that are not part of the OpenAI API. In order to use them, you can pass them as extra parameters in the OpenAI client. Or directly merge them into the JSON payload if you are using HTTP call directly. @@ -49,7 +62,26 @@ completion = client.chat.completions.create( ) ``` -### Extra Parameters for Chat API +### Extra Parameters for Completions API + +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. 
+ +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-sampling-params +:end-before: end-completion-sampling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-extra-params +:end-before: end-completion-extra-params +``` + +### Extra Parameters for Chat Completions API + The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py @@ -66,21 +98,22 @@ The following extra parameters are supported: :end-before: end-chat-completion-extra-params ``` -### Extra Parameters for Completions API -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +### Extra Parameters for Embeddings API + +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-completion-sampling-params -:end-before: end-completion-sampling-params +:start-after: begin-embedding-pooling-params +:end-before: end-embedding-pooling-params ``` The following extra parameters are supported: ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-completion-extra-params -:end-before: end-completion-extra-params +:start-after: begin-embedding-extra-params +:end-before: end-embedding-extra-params ``` ## Chat Template diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index d3aea533b6db9..4616f363cc04a 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -1,7 +1,6 @@ from http import HTTPStatus from typing import List -import openai import pytest import pytest_asyncio import requests @@ -83,10 +82,8 @@ async def client(server): indirect=True, ) @pytest.mark.asyncio -async def test_show_version(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/version") +async def test_show_version(server: RemoteOpenAIServer): + response = requests.get(server.url_for("version")) response.raise_for_status() assert response.json() == {"version": VLLM_VERSION} @@ -102,9 +99,7 @@ async def test_show_version(client: openai.AsyncOpenAI): indirect=True, ) @pytest.mark.asyncio -async def test_check_health(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/health") +async def test_check_health(server: RemoteOpenAIServer): + response = requests.get(server.url_for("health")) assert response.status_code == HTTPStatus.OK diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index f119c6c1201c9..9f2b77dde2a7f 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -4,14 +4,18 @@ import openai import pytest import pytest_asyncio +import requests + +from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # 
noqa: E501 @pytest.fixture(scope="module") -def embedding_server(): +def server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -19,31 +23,29 @@ def embedding_server(): "--enforce-eager", "--max-model-len", "8192", + "--chat-template", + DUMMY_CHAT_TEMPLATE, ] - with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @pytest_asyncio.fixture -async def embedding_client(embedding_server): - async with embedding_server.get_async_client() as async_client: +async def client(server): + async with server.get_async_client() as async_client: yield async_client @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "The chef prepared a delicious meal.", ] # test single embedding - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", @@ -57,7 +59,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, # test using token IDs input_tokens = [1, 1, 1, 1, 1] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", @@ -71,18 +73,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): # test List[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." 
] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", @@ -90,11 +88,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, assert embeddings.id is not None assert len(embeddings.data) == 3 assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 32 + assert embeddings.usage.total_tokens == 32 # test List[List[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", @@ -108,22 +109,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_conversation_embedding(server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "user", + "content": "The cat sat on the mat.", + }, { + "role": "assistant", + "content": "A feline was resting on a rug.", + }, { + "role": "user", + "content": "Stars twinkle brightly in the night sky.", + }] + + chat_response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float", + }) + chat_response.raise_for_status() + chat_embeddings = chat_response.json() + + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + prompt = tokenizer.apply_chat_template( + messages, + chat_template=DUMMY_CHAT_TEMPLATE, + add_generation_prompt=True, + continue_final_message=False, + tokenize=False, + ) + completion_response = await client.embeddings.create( + model=model_name, + input=prompt, + encoding_format="float", + # To be consistent with chat + extra_body={"add_special_tokens": False}, + ) + completion_embeddings = completion_response.model_dump(mode="json") + + assert chat_embeddings.pop("id") is not None + assert completion_embeddings.pop("id") is not None + assert chat_embeddings.pop("created") <= completion_embeddings.pop( + "created") + assert chat_embeddings == completion_embeddings + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_base64_embedding(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "Hello my name is", "The best thing about vLLM is that it supports many different models" ] - responses_float = await embedding_client.embeddings.create( - input=input_texts, model=model_name, encoding_format="float") + responses_float = await client.embeddings.create(input=input_texts, + model=model_name, + encoding_format="float") - responses_base64 = await embedding_client.embeddings.create( - input=input_texts, model=model_name, encoding_format="base64") + responses_base64 = await client.embeddings.create(input=input_texts, + model=model_name, + encoding_format="base64") decoded_responses_base64_data = [] for data in responses_base64.data: @@ -137,8 +186,8 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, 1] # Default response is float32 decoded from base64 by OpenAI Client - responses_default = await embedding_client.embeddings.create( - input=input_texts, model=model_name) 
+ responses_default = await client.embeddings.create(input=input_texts, + model=model_name) assert responses_float.data[0].embedding == responses_default.data[ 0].embedding @@ -147,18 +196,15 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding_truncation( - embedding_client: openai.AsyncOpenAI, model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] # test single embedding - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}) @@ -173,7 +219,7 @@ async def test_single_embedding_truncation( 1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2 ] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}) @@ -187,18 +233,15 @@ async def test_single_embedding_truncation( @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding_truncation_invalid( - embedding_client: openai.AsyncOpenAI, model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] with pytest.raises(openai.BadRequestError): - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 8193}) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6cb74eb78cbf0..b3f1fea91d13e 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -79,9 +79,8 @@ async def client(server): @pytest.mark.asyncio -async def test_metrics_counts(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - +async def test_metrics_counts(server: RemoteOpenAIServer, + client: openai.AsyncClient): for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. await client.completions.create( @@ -89,7 +88,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): prompt=_TOKENIZED_PROMPT, max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST) - response = requests.get(base_url + "/metrics") + response = requests.get(server.url_for("metrics")) print(response.text) assert response.status_code == HTTPStatus.OK @@ -170,16 +169,15 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_metrics_exist(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - +async def test_metrics_exist(server: RemoteOpenAIServer, + client: openai.AsyncClient): # sending a request triggers the metrics to be logged. 
await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0) - response = requests.get(base_url + "/metrics") + response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK for metric in EXPECTED_METRICS: diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 859a676a9c777..b1956a8cbc9dc 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -1,4 +1,3 @@ -import openai # use the official client for correctness check import pytest import pytest_asyncio import requests @@ -55,9 +54,11 @@ async def client(server): [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_tokenize_completions(client: openai.AsyncOpenAI, - model_name: str, tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_tokenize_completions( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") @@ -65,7 +66,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, prompt = "vllm1 This is a test prompt." tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(base_url + "/tokenize", + response = requests.post(server.url_for("tokenize"), json={ "add_special_tokens": add_special, "model": model_name, @@ -86,9 +87,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, - tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_tokenize_chat( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") @@ -121,7 +124,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(base_url + "/tokenize", + response = requests.post(server.url_for("tokenize"), json={ "add_generation_prompt": add_generation, @@ -146,17 +149,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_detokenize(client: openai.AsyncOpenAI, model_name: str, - tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_detokenize( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") prompt = "This is a test prompt. 
vllm1" tokens = tokenizer.encode(prompt, add_special_tokens=False) - print(f"CALLING {base_url} FOR {model_name}") - response = requests.post(base_url + "/detokenize", + response = requests.post(server.url_for("detokenize"), json={ "model": model_name, "tokens": tokens diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py new file mode 100644 index 0000000000000..73a69da32e434 --- /dev/null +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -0,0 +1,94 @@ +from typing import Dict + +import pytest +import pytest_asyncio +import requests + +from vllm.multimodal.utils import encode_image_base64, fetch_image + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" +MAXIMUM_IMAGES = 2 + +# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) +TEST_IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +] + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "embedding", + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "5", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"image={MAXIMUM_IMAGES}", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.fixture(scope="session") +def base64_encoded_image() -> Dict[str, str]: + return { + image_url: encode_image_base64(fetch_image(image_url)) + for image_url in TEST_IMAGE_URLS + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, + image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." 
+ }, + ], + }] + + response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float" + }) + response.raise_for_status() + + embeddings = response.json() + assert embeddings["id"] is not None + assert len(embeddings["data"]) == 1 + assert len(embeddings["data"][0]["embedding"]) == 3072 + assert embeddings["usage"]["completion_tokens"] == 0 + assert embeddings["usage"]["prompt_tokens"] == 771 + assert embeddings["usage"]["total_tokens"] == 771 diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 46c92e10b360c..95fd56d916050 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -11,7 +11,7 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import AsyncIterator, Set +from typing import AsyncIterator, Optional, Set import uvloop from fastapi import APIRouter, FastAPI, Request @@ -51,7 +51,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_engine import BaseModelPath +from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -248,20 +248,25 @@ def mount_metrics(app: FastAPI): app.routes.append(metrics_route) -def chat(request: Request) -> OpenAIServingChat: +def base(request: Request) -> OpenAIServing: + # Reuse the existing instance + return tokenization(request) + + +def chat(request: Request) -> Optional[OpenAIServingChat]: return request.app.state.openai_serving_chat -def completion(request: Request) -> OpenAIServingCompletion: +def completion(request: Request) -> Optional[OpenAIServingCompletion]: return request.app.state.openai_serving_completion -def tokenization(request: Request) -> OpenAIServingTokenization: - return request.app.state.openai_serving_tokenization +def embedding(request: Request) -> Optional[OpenAIServingEmbedding]: + return request.app.state.openai_serving_embedding -def embedding(request: Request) -> OpenAIServingEmbedding: - return request.app.state.openai_serving_embedding +def tokenization(request: Request) -> OpenAIServingTokenization: + return request.app.state.openai_serving_tokenization def engine_client(request: Request) -> EngineClient: @@ -277,7 +282,9 @@ async def health(raw_request: Request) -> Response: @router.post("/tokenize") async def tokenize(request: TokenizeRequest, raw_request: Request): - generator = await tokenization(raw_request).create_tokenize(request) + handler = tokenization(raw_request) + + generator = await handler.create_tokenize(request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -289,7 +296,9 @@ async def tokenize(request: TokenizeRequest, raw_request: Request): @router.post("/detokenize") async def detokenize(request: DetokenizeRequest, raw_request: Request): - generator = await tokenization(raw_request).create_detokenize(request) + handler = tokenization(raw_request) + + generator = await handler.create_detokenize(request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ 
-301,7 +310,9 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request): @router.get("/v1/models") async def show_available_models(raw_request: Request): - models = await completion(raw_request).show_available_models() + handler = base(raw_request) + + models = await handler.show_available_models() return JSONResponse(content=models.model_dump()) @@ -314,9 +325,12 @@ async def show_version(): @router.post("/v1/chat/completions") async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + handler = chat(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Chat Completions API") - generator = await chat(raw_request).create_chat_completion( - request, raw_request) + generator = await handler.create_chat_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -330,8 +344,12 @@ async def create_chat_completion(request: ChatCompletionRequest, @router.post("/v1/completions") async def create_completion(request: CompletionRequest, raw_request: Request): - generator = await completion(raw_request).create_completion( - request, raw_request) + handler = completion(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Completions API") + + generator = await handler.create_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -343,8 +361,12 @@ async def create_completion(request: CompletionRequest, raw_request: Request): @router.post("/v1/embeddings") async def create_embedding(request: EmbeddingRequest, raw_request: Request): - generator = await embedding(raw_request).create_embedding( - request, raw_request) + handler = embedding(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") + + generator = await handler.create_embedding(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -382,30 +404,26 @@ async def stop_profile(raw_request: Request): @router.post("/v1/load_lora_adapter") async def load_lora_adapter(request: LoadLoraAdapterRequest, raw_request: Request): - response = await chat(raw_request).load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) - - response = await completion(raw_request).load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @router.post("/v1/unload_lora_adapter") async def unload_lora_adapter(request: UnloadLoraAdapterRequest, raw_request: Request): - response = await chat(raw_request).unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) - - response = await completion(raw_request).unload_lora_adapter(request) - if 
isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @@ -501,7 +519,8 @@ def init_app_state( chat_template=args.chat_template, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, - tool_parser=args.tool_call_parser) + tool_parser=args.tool_call_parser, + ) if model_config.task == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, @@ -510,13 +529,14 @@ def init_app_state( prompt_adapters=args.prompt_adapters, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, - ) + ) if model_config.task == "generate" else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, base_model_paths, request_logger=request_logger, - ) + chat_template=args.chat_template, + ) if model_config.task == "embedding" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 60fc5ac8d11d2..1335e51bd152c 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -708,7 +708,7 @@ def validate_stream_options(cls, data): return data -class EmbeddingRequest(OpenAIBaseModel): +class EmbeddingCompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/embeddings model: str @@ -720,10 +720,15 @@ class EmbeddingRequest(OpenAIBaseModel): # doc: begin-embedding-pooling-params additional_data: Optional[Any] = None - # doc: end-embedding-pooling-params # doc: begin-embedding-extra-params + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt."), + ) priority: int = Field( default=0, description=( @@ -737,6 +742,82 @@ def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) +class EmbeddingChatRequest(OpenAIBaseModel): + model: str + messages: List[ChatCompletionMessageParam] + + encoding_format: Literal["float", "base64"] = "float" + dimensions: Optional[int] = None + user: Optional[str] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + + # doc: begin-chat-embedding-pooling-params + additional_data: Optional[Any] = None + # doc: end-chat-embedding-pooling-params + + # doc: begin-chat-embedding-extra-params + add_generation_prompt: bool = Field( + default=True, + description= + ("If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model."), + ) + continue_final_message: bool = Field( + default=False, + description= + ("If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + "This allows you to \"prefill\" part of the model's response for it. 
" + "Cannot be used at the same time as `add_generation_prompt`."), + ) + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)."), + ) + chat_template: Optional[str] = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one."), + ) + chat_template_kwargs: Optional[Dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to pass to the template renderer. " + "Will be accessible by the chat template."), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling.")) + # doc: end-chat-embedding-extra-params + + @model_validator(mode="before") + @classmethod + def check_generation_prompt(cls, data): + if data.get("continue_final_message") and data.get( + "add_generation_prompt"): + raise ValueError("Cannot set both `continue_final_message` and " + "`add_generation_prompt` to True.") + return data + + def to_pooling_params(self): + return PoolingParams(additional_data=self.additional_data) + + +EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest] + + class CompletionLogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) token_logprobs: List[Optional[float]] = Field(default_factory=list) @@ -799,7 +880,7 @@ class EmbeddingResponseData(OpenAIBaseModel): class EmbeddingResponse(OpenAIBaseModel): - id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") + id: str = Field(default_factory=lambda: f"embd-{random_uuid()}") object: str = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index f5249a0c447b3..a64467a311523 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -217,13 +217,14 @@ async def main(args): prompt_adapters=None, request_logger=request_logger, chat_template=None, - ) + ) if model_config.task == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( engine, model_config, base_model_paths, request_logger=request_logger, - ) + chat_template=None, + ) if model_config.task == "embedding" else None tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) @@ -240,14 +241,31 @@ async def main(args): # Determine the type of request and run it. 
if request.url == "/v1/chat/completions": - response_futures.append( - run_request(openai_serving_chat.create_chat_completion, - request, tracker)) + handler_fn = (None if openai_serving_chat is None else + openai_serving_chat.create_chat_completion) + if handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg= + "The model does not support Chat Completions API", + )) + continue + + response_futures.append(run_request(handler_fn, request, tracker)) tracker.submitted() elif request.url == "/v1/embeddings": - response_futures.append( - run_request(openai_serving_embedding.create_embedding, request, - tracker)) + handler_fn = (None if openai_serving_embedding is None else + openai_serving_embedding.create_embedding) + if handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg="The model does not support Embeddings API", + )) + continue + + response_futures.append(run_request(handler_fn, request, tracker)) tracker.submitted() else: response_futures.append( diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 1f951d15a7a32..9551b4f2091dd 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -10,11 +10,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import (ConversationMessage, - apply_hf_chat_template, - apply_mistral_chat_template, - load_chat_template, - parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import ConversationMessage, load_chat_template from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -27,16 +23,12 @@ from vllm.entrypoints.openai.serving_engine import (BaseModelPath, LoRAModulePath, OpenAIServing, - PromptAdapterPath, - TextTokensPrompt) + PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager -from vllm.inputs import TokensPrompt from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob -from vllm.tracing import (contains_trace_headers, extract_trace_headers, - log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import iterate_with_cancellation @@ -94,12 +86,12 @@ async def create_chat_completion( raw_request: Optional[Request] = None, ) -> Union[AsyncGenerator[str, None], ChatCompletionResponse, ErrorResponse]: - """Completion API similar to OpenAI's API. + """ + Chat Completion API similar to OpenAI's API. See https://platform.openai.com/docs/api-reference/chat/create for the API specification. This API mimics the OpenAI - ChatCompletion API. - + Chat Completion API. 
""" error_check_ret = await self._check_model(request) if error_check_ret is not None: @@ -118,143 +110,106 @@ async def create_chat_completion( prompt_adapter_request, ) = self._maybe_get_adapters(request) - model_config = self.model_config tokenizer = await self.engine_client.get_tokenizer(lora_request) - - conversation, mm_data_future = parse_chat_messages_futures( - request.messages, model_config, tokenizer) + tool_parser = self.tool_parser + + # validation for OpenAI tools + # tool_choice = "required" is not supported + if request.tool_choice == "required": + return self.create_error_response( + "tool_choice = \"required\" is not supported!") + + if (request.tool_choice == "auto" and + not (self.enable_auto_tools and tool_parser is not None) + and not isinstance(tokenizer, MistralTokenizer)): + # for hf tokenizers, "auto" tools requires + # --enable-auto-tool-choice and --tool-call-parser + return self.create_error_response( + "\"auto\" tool choice requires " + "--enable-auto-tool-choice and --tool-call-parser to be set" + ) tool_dicts = None if request.tools is None else [ tool.model_dump() for tool in request.tools ] - prompt: Union[str, List[int]] - is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer) - if is_mistral_tokenizer: - prompt = apply_mistral_chat_template( - tokenizer, - messages=request.messages, - chat_template=request.chat_template or self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - tools=tool_dicts, - documents=request.documents, - **(request.chat_template_kwargs or {}), - ) - else: - prompt = apply_hf_chat_template( - tokenizer, - conversation=conversation, - chat_template=request.chat_template or self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - tools=tool_dicts, - documents=request.documents, - **(request.chat_template_kwargs or {}), - ) - except Exception as e: - logger.exception("Error in applying chat template from request") - return self.create_error_response(str(e)) - - try: - mm_data = await mm_data_future - except Exception as e: - logger.exception("Error in loading multi-modal data") + ( + conversation, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + tool_dicts=tool_dicts, + documents=request.documents, + chat_template_kwargs=request.chat_template_kwargs, + tool_parser=tool_parser, + truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - # validation for OpenAI tools - # tool_choice = "required" is not supported - if request.tool_choice == "required": - return self.create_error_response( - "tool_choice = \"required\" is not supported!") - - if not is_mistral_tokenizer and request.tool_choice == "auto" and not ( - self.enable_auto_tools and self.tool_parser is not None): - # for hf tokenizers, "auto" tools requires - # --enable-auto-tool-choice and --tool-call-parser - return self.create_error_response( - "\"auto\" tool choice requires " - "--enable-auto-tool-choice and --tool-call-parser to be set") - - request_id = f"chat-{request.request_id}" + request_id = 
f"chatcmpl-{request.request_id}" request_metadata = RequestResponseMetadata(request_id=request_id) if raw_request: raw_request.state.request_metadata = request_metadata + # Schedule the request and get the result generator. + generators: List[AsyncGenerator[RequestOutput, None]] = [] try: - if self.enable_auto_tools and self.tool_parser: - request = self.tool_parser(tokenizer).adjust_request( - request=request) - - if isinstance(prompt, str): - prompt_inputs = self._tokenize_prompt_input( - request, - tokenizer, - prompt, - truncate_prompt_tokens=request.truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - ) - else: - assert isinstance(prompt, list) and isinstance( - prompt[0], int - ), "Prompt has to be either a string or a list of token ids" - prompt_inputs = TextTokensPrompt( - prompt=tokenizer.decode(prompt), prompt_token_ids=prompt) - - assert prompt_inputs is not None - - sampling_params: Union[SamplingParams, BeamSearchParams] - default_max_tokens = self.max_model_len - len( - prompt_inputs["prompt_token_ids"]) - if request.use_beam_search: - sampling_params = request.to_beam_search_params( - default_max_tokens) - else: - sampling_params = request.to_sampling_params( - default_max_tokens) - - self._log_inputs(request_id, - prompt_inputs, - params=sampling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) - - engine_inputs = TokensPrompt( - prompt_token_ids=prompt_inputs["prompt_token_ids"]) - if mm_data is not None: - engine_inputs["multi_modal_data"] = mm_data - - is_tracing_enabled = (await - self.engine_client.is_tracing_enabled()) - trace_headers = None - if is_tracing_enabled and raw_request: - trace_headers = extract_trace_headers(raw_request.headers) - if (not is_tracing_enabled and raw_request - and contains_trace_headers(raw_request.headers)): - log_tracing_disabled_warning() - - if isinstance(sampling_params, BeamSearchParams): - result_generator = self.engine_client.beam_search( - prompt=engine_inputs, - model_config=self.model_config, - request_id=request_id, - params=sampling_params, - ) - else: - result_generator = self.engine_client.generate( - engine_inputs, - sampling_params, - request_id, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=request.priority, - ) + for i, engine_prompt in enumerate(engine_prompts): + sampling_params: Union[SamplingParams, BeamSearchParams] + default_max_tokens = self.max_model_len - len( + engine_prompt["prompt_token_ids"]) + if request.use_beam_search: + sampling_params = request.to_beam_search_params( + default_max_tokens) + else: + sampling_params = request.to_sampling_params( + default_max_tokens) + + self._log_inputs(request_id, + request_prompts[i], + params=sampling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + if isinstance(sampling_params, BeamSearchParams): + generator = self.engine_client.beam_search( + prompt=engine_prompt, + model_config=self.model_config, + request_id=request_id, + params=sampling_params, + ) + else: + generator = self.engine_client.generate( + engine_prompt, + sampling_params, + request_id, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=request.priority, + ) + + generators.append(generator) except ValueError as e: # TODO: Use a vllm-specific Validation Error 
return self.create_error_response(str(e)) + assert len(generators) == 1 + result_generator, = generators + if raw_request: result_generator = iterate_with_cancellation( result_generator, raw_request.is_disconnected) @@ -626,6 +581,9 @@ async def chat_completion_full_generator( final_res = res except asyncio.CancelledError: return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) assert final_res is not None diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index da521a6012530..570232be38379 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,7 +1,6 @@ import asyncio import time -from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List, - Optional) +from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional from typing import Sequence as GenericSequence from typing import Tuple, Union, cast @@ -30,18 +29,11 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob -from vllm.tracing import (contains_trace_headers, extract_trace_headers, - log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) -TypeTokenIDs = List[int] -TypeTopLogProbs = List[Optional[Dict[int, float]]] -TypeCreateLogProbsFn = Callable[ - [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], CompletionLogProbs] - class OpenAIServingCompletion(OpenAIServing): @@ -101,8 +93,6 @@ async def create_completion( if raw_request: raw_request.state.request_metadata = request_metadata - # Schedule the request and get the result generator. - generators: List[AsyncGenerator[RequestOutput, None]] = [] try: ( lora_request, @@ -111,19 +101,24 @@ async def create_completion( tokenizer = await self.engine_client.get_tokenizer(lora_request) - prompts = list( - self._tokenize_prompt_input_or_inputs( - request, - tokenizer, - request.prompt, - truncate_prompt_tokens=request.truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - )) + request_prompts, engine_prompts = self._preprocess_completion( + request, + tokenizer, + request.prompt, + truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) - for i, prompt_inputs in enumerate(prompts): + # Schedule the request and get the result generator. 
+ generators: List[AsyncGenerator[RequestOutput, None]] = [] + try: + for i, engine_prompt in enumerate(engine_prompts): sampling_params: Union[SamplingParams, BeamSearchParams] default_max_tokens = self.max_model_len - len( - prompt_inputs["prompt_token_ids"]) + engine_prompt["prompt_token_ids"]) if request.use_beam_search: sampling_params = request.to_beam_search_params( default_max_tokens) @@ -134,36 +129,24 @@ async def create_completion( request_id_item = f"{request_id}-{i}" self._log_inputs(request_id_item, - prompt_inputs, + request_prompts[i], params=sampling_params, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - is_tracing_enabled = (await - self.engine_client.is_tracing_enabled()) - trace_headers = None - if is_tracing_enabled: - trace_headers = extract_trace_headers(raw_request.headers) - if not is_tracing_enabled and contains_trace_headers( - raw_request.headers): - log_tracing_disabled_warning() + trace_headers = (await + self._get_trace_headers(raw_request.headers)) if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( - prompt={ - "prompt_token_ids": - prompt_inputs["prompt_token_ids"] - }, + prompt=engine_prompt, model_config=self.model_config, request_id=request_id, params=sampling_params, ) else: generator = self.engine_client.generate( - { - "prompt_token_ids": - prompt_inputs["prompt_token_ids"] - }, + engine_prompt, sampling_params, request_id_item, lora_request=lora_request, @@ -180,6 +163,8 @@ async def create_completion( result_generator = merge_async_iterators( *generators, is_cancelled=raw_request.is_disconnected) + num_prompts = len(engine_prompts) + # Similar to the OpenAI API, when n != best_of, we do not stream the # results. In addition, we do not stream the results when use # beam search. 
@@ -195,16 +180,22 @@ async def create_completion( request_id, created_time, model_name, - num_prompts=len(prompts), + num_prompts=num_prompts, tokenizer=tokenizer, request_metadata=request_metadata) # Non-streaming response - final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts) + final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts try: async for i, res in result_generator: final_res_batch[i] = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + try: for i, final_res in enumerate(final_res_batch): assert final_res is not None @@ -212,7 +203,7 @@ async def create_completion( # We did not pass it into vLLM engine to avoid being redundant # with the inputs token IDs if final_res.prompt is None: - final_res.prompt = prompts[i]["prompt"] + final_res.prompt = request_prompts[i]["prompt"] final_res_batch_checked = cast(List[RequestOutput], final_res_batch) @@ -226,8 +217,6 @@ async def create_completion( tokenizer, request_metadata, ) - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 6c46aae2838f6..917856cd2b2dd 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -9,8 +9,10 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import (EmbeddingRequest, +from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, + EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, UsageInfo) @@ -21,8 +23,6 @@ logger = init_logger(__name__) -TypeTokenIDs = List[int] - def _get_embedding( output: EmbeddingOutput, @@ -76,6 +76,7 @@ def __init__( base_model_paths: List[BaseModelPath], *, request_logger: Optional[RequestLogger], + chat_template: Optional[str], ): super().__init__(engine_client=engine_client, model_config=model_config, @@ -83,21 +84,20 @@ def __init__( lora_modules=None, prompt_adapters=None, request_logger=request_logger) - self._enabled = self._check_embedding_mode( - model_config.task == "embedding") + + self.chat_template = load_chat_template(chat_template) async def create_embedding( self, request: EmbeddingRequest, raw_request: Optional[Request] = None, ) -> Union[EmbeddingResponse, ErrorResponse]: - """Completion API similar to OpenAI's API. + """ + Embedding API similar to OpenAI's API. See https://platform.openai.com/docs/api-reference/embeddings/create for the API specification. This API mimics the OpenAI Embedding API. """ - if not self._enabled: - return self.create_error_response("Embedding API disabled") error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -122,8 +122,6 @@ async def create_embedding( "greater than max_model_len." " Please, select a smaller truncation size.") - # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] try: ( lora_request, @@ -132,32 +130,60 @@ async def create_embedding( tokenizer = await self.engine_client.get_tokenizer(lora_request) - pooling_params = request.to_pooling_params() + if prompt_adapter_request is not None: + raise NotImplementedError("Prompt adapter is not supported " + "for embedding models") + + if isinstance(request, EmbeddingChatRequest): + ( + _, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + else: + request_prompts, engine_prompts = self._preprocess_completion( + request, + tokenizer, + request.input, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) - prompts = list( - self._tokenize_prompt_input_or_inputs(request, tokenizer, - request.input, - truncate_prompt_tokens)) + # Schedule the request and get the result generator. + generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] + try: + pooling_params = request.to_pooling_params() - for i, prompt_inputs in enumerate(prompts): + for i, engine_prompt in enumerate(engine_prompts): request_id_item = f"{request_id}-{i}" self._log_inputs(request_id_item, - prompt_inputs, + request_prompts[i], params=pooling_params, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - if prompt_adapter_request is not None: - raise NotImplementedError( - "Prompt adapter is not supported " - "for embedding models") + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) generator = self.engine_client.encode( - {"prompt_token_ids": prompt_inputs["prompt_token_ids"]}, + engine_prompt, pooling_params, request_id_item, lora_request=lora_request, + trace_headers=trace_headers, priority=request.priority, ) @@ -171,13 +197,18 @@ async def create_embedding( is_cancelled=raw_request.is_disconnected if raw_request else None, ) + num_prompts = len(engine_prompts) + # Non-streaming response final_res_batch: List[Optional[EmbeddingRequestOutput]] - final_res_batch = [None] * len(prompts) + final_res_batch = [None] * num_prompts try: async for i, res in result_generator: final_res_batch[i] = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + try: for final_res in final_res_batch: assert final_res is not None @@ -187,18 +218,8 @@ async def create_embedding( response = request_output_to_embedding_response( final_res_batch_checked, request_id, created_time, model_name, encoding_format) - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) return response - - def _check_embedding_mode(self, embedding_mode: bool) -> bool: - if not embedding_mode: - logger.warning( - "embedding_mode is False. 
Embedding API will not work.") - else: - logger.info("Activating the server engine with embedding enabled.") - return embedding_mode diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 22a01b3dc4cc0..e7aeac8f8c018 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -2,28 +2,38 @@ import pathlib from dataclasses import dataclass from http import HTTPStatus -from typing import Iterable, Iterator, List, Optional, Tuple, TypedDict, Union +from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, + Optional, Sequence, Tuple, TypedDict, Union) from pydantic import Field +from starlette.datastructures import Headers from typing_extensions import Annotated from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + ConversationMessage, + apply_hf_chat_template, + apply_mistral_chat_template, + parse_chat_messages_futures) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest, DetokenizeRequest, - EmbeddingRequest, ErrorResponse, + EmbeddingChatRequest, + EmbeddingCompletionRequest, + ErrorResponse, LoadLoraAdapterRequest, ModelCard, ModelList, ModelPermission, TokenizeChatRequest, TokenizeCompletionRequest, - TokenizeRequest, UnloadLoraAdapterRequest) +from vllm.entrypoints.openai.tool_parsers import ToolParser # yapf: enable +from vllm.inputs import TokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -31,8 +41,10 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import AtomicCounter +from vllm.tracing import (contains_trace_headers, extract_trace_headers, + log_tracing_disabled_warning) +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import AtomicCounter, is_list_of logger = init_logger(__name__) @@ -56,8 +68,14 @@ class LoRAModulePath: base_model_name: Optional[str] = None -AnyRequest = Union[ChatCompletionRequest, CompletionRequest, DetokenizeRequest, - EmbeddingRequest, TokenizeRequest] +CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest, + EmbeddingCompletionRequest, + TokenizeCompletionRequest] + +ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest, + TokenizeChatRequest] + +AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest] class TextTokensPrompt(TypedDict): @@ -65,6 +83,9 @@ class TextTokensPrompt(TypedDict): prompt_token_ids: List[int] +RequestPrompt = Union[List[int], str, TextTokensPrompt] + + class OpenAIServing: def __init__( @@ -246,7 +267,8 @@ def _validate_input( token_num = len(input_ids) # Note: EmbeddingRequest doesn't have max_tokens - if isinstance(request, EmbeddingRequest): + if isinstance(request, + (EmbeddingChatRequest, EmbeddingCompletionRequest)): if token_num > self.max_model_len: raise ValueError( f"This model's maximum context length is " @@ -373,10 +395,115 @@ def _tokenize_prompt_input_or_inputs( truncate_prompt_tokens=truncate_prompt_tokens, ) + def _preprocess_completion( + self, + request: CompletionLikeRequest, + tokenizer: AnyTokenizer, + 
input_or_inputs: Union[str, List[str], List[int], List[List[int]]], + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, + add_special_tokens: bool = True, + ) -> Tuple[Sequence[TextTokensPrompt], List[TokensPrompt]]: + request_prompts = [ + request_prompt + for request_prompt in self._tokenize_prompt_input_or_inputs( + request, + tokenizer, + input_or_inputs, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens, + ) + ] + + engine_prompts = [ + TokensPrompt(prompt_token_ids=request_prompt["prompt_token_ids"]) + for request_prompt in request_prompts + ] + + return request_prompts, engine_prompts + + async def _preprocess_chat( + self, + request: ChatLikeRequest, + tokenizer: AnyTokenizer, + messages: List[ChatCompletionMessageParam], + chat_template: Optional[str] = None, + add_generation_prompt: bool = True, + continue_final_message: bool = False, + tool_dicts: Optional[List[Dict[str, Any]]] = None, + documents: Optional[List[Dict[str, str]]] = None, + chat_template_kwargs: Optional[Dict[str, Any]] = None, + tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, + add_special_tokens: bool = False, + ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt], + List[TokensPrompt]]: + conversation, mm_data_future = parse_chat_messages_futures( + messages, + self.model_config, + tokenizer, + ) + + request_prompt: Union[str, List[int]] + is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer) + if is_mistral_tokenizer: + request_prompt = apply_mistral_chat_template( + tokenizer, + messages=messages, + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + continue_final_message=continue_final_message, + tools=tool_dicts, + documents=documents, + **(chat_template_kwargs or {}), + ) + else: + request_prompt = apply_hf_chat_template( + tokenizer, + conversation=conversation, + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + continue_final_message=continue_final_message, + tools=tool_dicts, + documents=documents, + **(chat_template_kwargs or {}), + ) + + mm_data = await mm_data_future + + if tool_parser is not None: + if not isinstance(request, ChatCompletionRequest): + msg = "Tool usage is only supported for Chat Completions API" + raise NotImplementedError(msg) + + request = tool_parser(tokenizer).adjust_request(request=request) + + if isinstance(request_prompt, str): + prompt_inputs = self._tokenize_prompt_input( + request, + tokenizer, + request_prompt, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens, + ) + else: + # For MistralTokenizer + assert is_list_of(request_prompt, int), ( + "Prompt has to be either a string or a list of token ids") + prompt_inputs = TextTokensPrompt( + prompt=tokenizer.decode(request_prompt), + prompt_token_ids=request_prompt) + + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["prompt_token_ids"]) + if mm_data is not None: + engine_prompt["multi_modal_data"] = mm_data + + return conversation, [request_prompt], [engine_prompt] + def _log_inputs( self, request_id: str, - inputs: Union[str, List[int], TextTokensPrompt], + inputs: RequestPrompt, params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]], lora_request: Optional[LoRARequest], @@ -404,6 +531,20 @@ def _log_inputs( prompt_adapter_request=prompt_adapter_request, ) + async def _get_trace_headers( + self, + headers: Headers, + ) -> 
Optional[Mapping[str, str]]: + is_tracing_enabled = await self.engine_client.is_tracing_enabled() + + if is_tracing_enabled: + return extract_trace_headers(headers) + + if contains_trace_headers(headers): + log_tracing_disabled_warning() + + return None + @staticmethod def _get_decoded_token(logprob: Logprob, token_id: int, diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index a269c94c7ec0d..1fd82304f7a4d 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -2,10 +2,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import (apply_hf_chat_template, - apply_mistral_chat_template, - load_chat_template, - parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -20,7 +17,6 @@ LoRAModulePath, OpenAIServing) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.utils import random_uuid logger = init_logger(__name__) @@ -62,59 +58,51 @@ async def create_tokenize( request_id = f"tokn-{random_uuid()}" - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) - - tokenizer = await self.engine_client.get_tokenizer(lora_request) - - prompt: Union[str, List[int]] - if isinstance(request, TokenizeChatRequest): - model_config = self.model_config - - conversation, mm_data_future = parse_chat_messages_futures( - request.messages, model_config, tokenizer) - - mm_data = await mm_data_future - if mm_data: - logger.warning( - "Multi-modal inputs are ignored during tokenization") - - if isinstance(tokenizer, MistralTokenizer): - prompt = apply_mistral_chat_template( + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + tokenizer = await self.engine_client.get_tokenizer(lora_request) + + if isinstance(request, TokenizeChatRequest): + ( + _, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, tokenizer, - messages=request.messages, + request.messages, chat_template=self.chat_template, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, + add_special_tokens=request.add_special_tokens, ) else: - prompt = apply_hf_chat_template( + request_prompts, engine_prompts = self._preprocess_completion( + request, tokenizer, - conversation=conversation, - chat_template=self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, + request.prompt, + add_special_tokens=request.add_special_tokens, ) - else: - prompt = request.prompt + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) - self._log_inputs(request_id, - prompt, - params=None, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + input_ids: List[int] = [] + for i, engine_prompt in enumerate(engine_prompts): + self._log_inputs(request_id, + request_prompts[i], + params=None, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) - # Silently ignore prompt adapter since it does not affect tokenization + # Silently ignore prompt adapter since it does not affect + # tokenization (Unlike in Embeddings API where an error is raised) - 
prompt_input = self._tokenize_prompt_input( - request, - tokenizer, - prompt, - add_special_tokens=request.add_special_tokens, - ) - input_ids = prompt_input["prompt_token_ids"] + input_ids.extend(engine_prompt["prompt_token_ids"]) return TokenizeResponse(tokens=input_ids, count=len(input_ids), @@ -143,9 +131,8 @@ async def create_detokenize( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - if prompt_adapter_request is not None: - raise NotImplementedError("Prompt adapter is not supported " - "for tokenization") + # Silently ignore prompt adapter since it does not affect tokenization + # (Unlike in Embeddings API where an error is raised) prompt_input = self._tokenize_prompt_input( request, diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 7461fb51989c6..2635c0bccd1c4 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -7,7 +7,7 @@ class PoolingParams( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] - """Pooling parameters for pooling. + """Pooling parameters for embeddings API. Attributes: additional_data: Any additional data needed for pooling. @@ -16,7 +16,7 @@ class PoolingParams( def clone(self) -> "PoolingParams": """Returns a deep copy of the PoolingParams instance.""" - return PoolingParams(additional_data=self.additional_data, ) + return PoolingParams(additional_data=self.additional_data) def __repr__(self) -> str: return (f"PoolingParams(" From 30a2e8074246e11a1452ab5e84a7be65ecac6119 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 1 Nov 2024 09:55:29 -0400 Subject: [PATCH 09/43] [CI/Build] Add Model Tests for PixtralHF (#9813) --- tests/models/decoder_only/vision_language/test_models.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index d738647c91b66..e49ea6f98324d 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -291,6 +291,15 @@ # vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output, # num_logprobs=10, # ), + "pixtral_hf": VLMTestInfo( + models=["nm-testing/pixtral-12b-FP8-dynamic"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"[INST]{img_prompt}[/INST]", + img_idx_to_prompt=lambda idx: "[IMG]", + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + ), "qwen": VLMTestInfo( models=["Qwen/Qwen-VL"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), From ba0d8920742597269745f3551eb97b1b19f5e582 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 1 Nov 2024 22:09:07 +0800 Subject: [PATCH 10/43] [Frontend] Use a proper chat template for VLM2Vec (#9912) --- docs/source/models/vlm.rst | 14 +++++--- ..._chat_completion_client_for_multimodal.py} | 0 ...ai_chat_embedding_client_for_multimodal.py | 33 +++++++++++++++++++ examples/template_vlm2vec.jinja | 16 +++++++++ .../openai/test_vision_embedding.py | 11 +++++-- vllm/entrypoints/chat_utils.py | 15 ++++++--- 6 files changed, 78 insertions(+), 11 deletions(-) rename examples/{openai_api_client_for_multimodal.py => openai_chat_completion_client_for_multimodal.py} (100%) create mode 100644 examples/openai_chat_embedding_client_for_multimodal.py create mode 100644 examples/template_vlm2vec.jinja diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index ac6405b9807a8..3377502a6db28 100644 --- 
a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -240,8 +240,7 @@ To consume the server, you can use the OpenAI client like in the example below: ) print("Chat completion output:", chat_response.choices[0].message.content) - -A full code example can be found in `examples/openai_api_client_for_multimodal.py `_. +A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. .. tip:: There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. @@ -269,14 +268,19 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. .. code-block:: bash vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \ - --trust-remote-code --max-model-len 4096 + --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja .. important:: Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` to run this model in embedding mode instead of text generation mode. -Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: +.. important:: + + VLM2Vec does not expect chat-based input. We use a `custom chat template `_ + to combine the text and images together. + +Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: .. code-block:: python @@ -301,3 +305,5 @@ Since this schema is not defined by OpenAI client, we post a request to the serv response.raise_for_status() response_json = response.json() print("Embedding output:", response_json["data"][0]["embedding"]) + +A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py similarity index 100% rename from examples/openai_api_client_for_multimodal.py rename to examples/openai_chat_completion_client_for_multimodal.py diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py new file mode 100644 index 0000000000000..effb588e1387f --- /dev/null +++ b/examples/openai_chat_embedding_client_for_multimodal.py @@ -0,0 +1,33 @@ +import requests + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": + "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." 
+ }, + ], + }], + "encoding_format": + "float", + }, +) +response.raise_for_status() +response_json = response.json() + +print("Embedding output:", response_json["data"][0]["embedding"]) diff --git a/examples/template_vlm2vec.jinja b/examples/template_vlm2vec.jinja new file mode 100644 index 0000000000000..489b99604af38 --- /dev/null +++ b/examples/template_vlm2vec.jinja @@ -0,0 +1,16 @@ +{%- if messages | length > 1 -%} + {{ raise_exception('Embedding models should only embed one message at a time') }} +{%- endif -%} + +{% set vars = namespace(parts=[], next_image_id=1) %} +{%- for message in messages -%} + {%- for content in message['content'] -%} + {%- if content['type'] == 'text' -%} + {%- set vars.parts = vars.parts + [content['text']] %} + {%- elif content['type'] == 'image' -%} + {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %} + {%- set vars.next_image_id = vars.next_image_id + 1 %} + {%- endif -%} + {%- endfor -%} +{%- endfor -%} +{{ vars.parts | join(' ') }} diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 73a69da32e434..d0c43b47bf0af 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -6,11 +6,14 @@ from vllm.multimodal.utils import encode_image_base64, fetch_image -from ...utils import RemoteOpenAIServer +from ...utils import VLLM_PATH, RemoteOpenAIServer MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MAXIMUM_IMAGES = 2 +vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja" +assert vlm2vec_jinja_path.exists() + # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_URLS = [ "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", @@ -35,6 +38,8 @@ def server(): "--trust-remote-code", "--limit-mm-per-prompt", f"image={MAXIMUM_IMAGES}", + "--chat-template", + str(vlm2vec_jinja_path), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -90,5 +95,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, assert len(embeddings["data"]) == 1 assert len(embeddings["data"][0]["embedding"]) == 3072 assert embeddings["usage"]["completion_tokens"] == 0 - assert embeddings["usage"]["prompt_tokens"] == 771 - assert embeddings["usage"]["total_tokens"] == 771 + assert embeddings["usage"]["prompt_tokens"] == 762 + assert embeddings["usage"]["total_tokens"] == 762 diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index ce36f20760f4c..bc2de2d162473 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -156,6 +156,10 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): self._items: List[_T] = [] + @property + def model_config(self) -> ModelConfig: + return self._model_config + @staticmethod @lru_cache(maxsize=None) def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: @@ -491,10 +495,13 @@ def _parse_chat_message_content_parts( content: List[Union[str, Dict[str, str]]] = [] mm_parser = mm_tracker.create_parser() - wrap_dicts = \ - mm_tracker._model_config.hf_config.model_type in \ - MODEL_KEEP_MULTI_MODAL_CONTENT or \ - (chat_template_text_format == "openai") + model_config = mm_tracker.model_config + + wrap_dicts = (chat_template_text_format == "openai" + or (model_config.task == "embedding" + and model_config.is_multimodal_model) + or 
(model_config.hf_config.model_type + in MODEL_KEEP_MULTI_MODAL_CONTENT)) for part in parts: parse_res = _parse_chat_message_content_part( From 1dd4cb2935fc3fff9c156b5772d18e0a0d1861f0 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Fri, 1 Nov 2024 11:33:15 -0600 Subject: [PATCH 11/43] [Bugfix] Fix edge cases for MistralTokenizer (#9625) Signed-off-by: Travis Johnson Signed-off-by: Prashant Gupta Co-authored-by: Prashant Gupta Co-authored-by: Patrick von Platen --- tests/tokenization/test_detokenize.py | 80 +++++++++++++++---- vllm/transformers_utils/tokenizers/mistral.py | 64 ++++++++++----- 2 files changed, 105 insertions(+), 39 deletions(-) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index f4551ed42efb8..1d07885349409 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Generator, List, Optional import pytest from transformers import AutoTokenizer @@ -7,11 +7,17 @@ from vllm.transformers_utils.detokenizer import (Detokenizer, detokenize_incrementally) from vllm.transformers_utils.tokenizer_group import get_tokenizer_group +from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer TRUTH = [ "Hello here, this is a simple test", "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving", # noqa - "我很感谢你的热情" + "我很感谢你的热情", + # Burmese text triggers an edge-case for Mistral's V3-Tekken tokenizer (eg. + # for mistralai/Pixtral-12B-2409) where tokens may map to bytes with + # incomplete UTF-8 characters + # see https://github.com/vllm-project/vllm/pull/9625 + "ပုံပြင်လေးပြောပြပါ်", ] TOKENIZERS = [ "facebook/opt-125m", @@ -24,6 +30,7 @@ "tiiuae/falcon-7b", "meta-llama/Llama-2-7b-hf", "codellama/CodeLlama-7b-hf", + "mistralai/Pixtral-12B-2409", ] @@ -49,15 +56,55 @@ def _run_incremental_decode(tokenizer, all_input_ids, return decoded_text +@pytest.fixture +def tokenizer(tokenizer_name): + return (MistralTokenizer.from_pretrained(tokenizer_name) + if "mistral" in tokenizer_name else + AutoTokenizer.from_pretrained(tokenizer_name)) + + +@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"]) +@pytest.mark.parametrize( + "truth", + [ + # Burmese text triggers an edge-case where tokens may map to bytes with + # incomplete UTF-8 characters + "ပုံပြင်လေးပြောပြပါ", + # Using "URGENCY" since "CY" has token id 130282 + "URGENCY🌶️", + ]) +def test_mistral_edge_case(tokenizer, truth): + """Test for a specific edge cases with V3-Tekken MistralTokenizer. 
+ + See https://github.com/vllm-project/vllm/pull/9625 + """ + starting_index = 0 + all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids + + decoded_text = _run_incremental_decode(tokenizer, + all_input_ids, + skip_special_tokens=True, + starting_index=starting_index) + assert decoded_text == truth + + +@pytest.fixture +def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]: + if "mistral" in tokenizer_name: + yield ( + bool(True) if request.param else + pytest.skip("mistral doesn't support skip_special_tokens=False")) + else: + yield bool(True) if request.param else bool(False) + + @pytest.mark.parametrize("truth", TRUTH) @pytest.mark.parametrize("with_prompt", [True, False]) -@pytest.mark.parametrize("tokenizer_id", TOKENIZERS) -@pytest.mark.parametrize("skip_special_tokens", (True, False)) -def test_decode_streaming(tokenizer_id, truth, with_prompt, - skip_special_tokens): - tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) +@pytest.mark.parametrize("tokenizer_name", TOKENIZERS) +@pytest.mark.parametrize("skip_special_tokens", (True, False), indirect=True) +def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens): if with_prompt: - truth_tokens = tokenizer(truth, add_special_tokens=False)["input_ids"] + truth_tokens = tokenizer(truth, add_special_tokens=False).input_ids prompt_input_ids = truth_tokens[:len(truth) // 2] generated_input_ids = truth_tokens[len(truth) // 2:] all_input_ids = prompt_input_ids + generated_input_ids @@ -68,7 +115,7 @@ def test_decode_streaming(tokenizer_id, truth, with_prompt, else: generated = truth starting_index = 0 - all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"] + all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids if skip_special_tokens: if tokenizer.bos_token_id is not None: all_input_ids = [tokenizer.bos_token_id] + all_input_ids @@ -98,7 +145,7 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: enable_lora=False, max_num_seqs=100, max_input_length=None, - tokenizer_mode="auto", + tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto", trust_remote_code=False, revision=None, ) @@ -113,9 +160,8 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: @pytest.fixture(name="complete_sequence_token_ids") def create_complete_sequence_token_ids(complete_sequence: str, - tokenizer_name: str) -> List[int]: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - complete_sequence_token_ids = tokenizer(complete_sequence)["input_ids"] + tokenizer) -> List[int]: + complete_sequence_token_ids = tokenizer(complete_sequence).input_ids return complete_sequence_token_ids @@ -150,7 +196,7 @@ def create_dummy_prompt_logprobs( @pytest.mark.parametrize("complete_sequence", TRUTH) @pytest.mark.parametrize("tokenizer_name", TOKENIZERS) -@pytest.mark.parametrize("skip_special_tokens", [True, False]) +@pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True) def test_decode_sequence_logprobs(complete_sequence: str, complete_sequence_token_ids: List[int], detokenizer: Detokenizer, @@ -208,9 +254,9 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int], # decoded_prompt_logprobs doesn't contain the first token. 
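The failure mode exercised by the Burmese test strings above can be reproduced with plain Python, independent of vLLM or mistral_common; the byte values below are just one example of a multi-byte character split across byte-level tokens:

# One Burmese character (U+1015) from the test strings above, as raw UTF-8 bytes.
char_bytes = b"\xe1\x80\x95"

# A byte-level (Tekken-style) tokenizer may emit these bytes one token at a time.
pieces = [char_bytes[i:i + 1] for i in range(len(char_bytes))]

# Decoding each piece on its own only produces the replacement character...
assert all("\ufffd" in p.decode("utf-8", errors="replace") for p in pieces)

# ...while accumulating the raw bytes and decoding once recovers the character.
# This is why the MistralTokenizer change in this patch falls back to byte
# pieces whenever a decoded token still contains "\ufffd".
assert b"".join(pieces).decode("utf-8") == "\u1015"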
token_ids = complete_sequence_token_ids - tokenzier = detokenizer.get_tokenizer_for_seq(seq) - text_full = tokenzier.decode(token_ids, skip_special_tokens=True) - text_first = tokenzier.decode(token_ids[0], skip_special_tokens=True) + tokenizer = detokenizer.get_tokenizer_for_seq(seq) + text_full = tokenizer.decode(token_ids, skip_special_tokens=True) + text_first = tokenizer.decode(token_ids[0], skip_special_tokens=True) text = text_full[len(text_first):] # Text for logprobs for the chosen token should be the same as the diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 80e21c2d32ecc..896f70bc1dafd 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -16,9 +16,13 @@ from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy, Tekkenizer) +from vllm.logger import init_logger + if TYPE_CHECKING: from vllm.entrypoints.chat_utils import ChatCompletionMessageParam +logger = init_logger(__name__) + @dataclass class Encoding: @@ -72,20 +76,21 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None: # Make sure special tokens will not raise tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE - self._vocab = { - token: idx - for idx, token in enumerate(tokenizer_.vocab()) - } elif isinstance(tokenizer_, SentencePieceTokenizer): - self._vocab = { - token: idx - for idx, token in enumerate(tokenizer_.vocab()) - } + pass else: raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}") + self._vocab = tokenizer_.vocab() + # Convert to a Dict[str, int] to match protocol, but this is a lossy + # conversion. There may be multiple token ids that decode to the same + # string due to partial UTF-8 byte sequences being converted to � + self._vocab_dict = { + token: idx + for idx, token in enumerate(self._vocab) + } self.tokenizer = tokenizer_ - self._max_token_id = max(self._vocab.values()) + self._max_token_id = self.vocab_size - 1 @classmethod def from_pretrained(cls, @@ -182,7 +187,9 @@ def __call__( return Encoding(input_ids=input_ids) def get_vocab(self) -> Dict[str, int]: - return self._vocab + # NB: the dictionary form of the vocabulary collapses token ids that map + # to the same string but have different bytes + return self._vocab_dict def get_added_vocab(self) -> Dict[str, int]: # Mistral tokenizers have no added vocabulary @@ -220,14 +227,20 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str: if any(isinstance(t, bytes) for t in tokens): # we need to encode and decode all tokens again shift = self.tokenizer.num_special_tokens - byte_tokens = [ - t.encode("utf-8") if not isinstance(t, bytes) else t - for t in tokens - ] - ids = [ - self.tokenizer._tekken_token2id_nospecial[t] + shift - for t in byte_tokens - ] + + def _token_to_id(t: str): + t_bytes = t.encode("utf-8") \ + if not isinstance(t, bytes) else t + try: + return shift + \ + self.tokenizer._tekken_token2id_nospecial[t_bytes] + except KeyError: + logger.warning( + "Failed to convert token %s to id," + " replacing with ", t_bytes) + return self.tokenizer.unk_id + + ids = [_token_to_id(t) for t in tokens] decoded = self.tokenizer.decode(ids) else: decoded = "".join(tokens) @@ -236,7 +249,13 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str: return decoded - def decode(self, ids: Union[List[int], int]) -> str: + def decode(self, + ids: Union[List[int], int], + skip_special_tokens: bool = True) -> str: + assert ( + skip_special_tokens + ), "Skipping special tokens 
is not supported for Mistral tokenizers." + if isinstance(ids, int): ids = [ids] return self.tokenizer.decode(ids) @@ -257,10 +276,11 @@ def convert_ids_to_tokens( tokens = [self.tokenizer.id_to_piece(id) for id in ids] - if any(t.strip() == "�" for t in tokens): - # if any stripped decoded token is undefined - # because it's invalid unicode then pass bytes + if any("�" in t for t in tokens): + # if a decoded token contains the replacement character, then the + # token has an incomplete UTF-8 character so we must use bytes # See: https://github.com/vllm-project/vllm/pull/8640 + # https://github.com/vllm-project/vllm/pull/9625 tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids] return tokens From 4581d2cc02f655e76233f9cb129f07c6b65d39f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Jonasson?= Date: Fri, 1 Nov 2024 19:41:38 +0100 Subject: [PATCH 12/43] [Core] Refactor: Clean up unused argument in Scheduler._preempt (#9696) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: André Jonasson --- vllm/core/scheduler.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 88733b8f53b86..e35c05f4fe7f7 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -828,8 +828,7 @@ def _schedule_priority_preemption( num_running_seqs) #Preempt out the victim sequence group - self._preempt(vseq_group, blocks_to_swap_out, - PreemptionMode.RECOMPUTE) + self._preempt(vseq_group, blocks_to_swap_out) waiting_queue.appendleft(vseq_group) force_preemption_count += 1 #Put the sequence back into the waiting queue @@ -1451,12 +1450,8 @@ def _append_slots(self, if len(cows) > 0: blocks_to_copy.extend(cows) - def _preempt( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], - preemption_mode: Optional[PreemptionMode] = None, - ) -> PreemptionMode: + def _preempt(self, seq_group: SequenceGroup, + blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than # swapping. However, when the sequence group has multiple sequences From aff1fd81881bf29f82ad6ba55b301828764cd120 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 1 Nov 2024 11:50:37 -0700 Subject: [PATCH 13/43] [torch.compile] use interpreter with stable api from pytorch (#9889) Signed-off-by: youkaichao --- vllm/compilation/backends.py | 165 +++++++++++++++++++---------------- 1 file changed, 89 insertions(+), 76 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 10cf49e19eccc..96ddcba467c5b 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -243,6 +243,65 @@ def split_graph(graph: fx.GraphModule, return split_gm, outputs +# we share the global graph pool among all the backends +global_graph_pool = None + + +class PiecewiseCompileInterpreter(torch.fx.Interpreter): + """Code adapted from `torch.fx.passes.shape_prop.ShapeProp`. + It runs the given graph with fake inputs, and compile some + submodules specified by `compile_submod_names` with the given + compilation configs. 
+ """ + + def __init__(self, module: torch.fx.GraphModule, + compile_submod_names: List[str], + compilation_configs: CompilationConfig, graph_pool): + super().__init__(module) + from torch._guards import detect_fake_mode + self.fake_mode = detect_fake_mode() + self.compile_submod_names = compile_submod_names + self.compilation_configs = compilation_configs + self.graph_pool = graph_pool + self.have_seen_first_graph = False + + def run(self, *args): + fake_args = [ + self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t + for t in args + ] + return super().run(*fake_args) + + def call_module(self, target: torch.fx.node.Target, + args: Tuple[torch.fx.node.Argument, + ...], kwargs: Dict[str, Any]) -> Any: + assert isinstance(target, str) + output = super().call_module(target, args, kwargs) + + if target in self.compile_submod_names: + submod = self.fetch_attr(target) + sym_shape_indices = [ + i for i, x in enumerate(args) if isinstance(x, torch.SymInt) + ] + compiled_graph_for_general_shape = wrap_inductor( + submod, + args, + self.compilation_configs.inductor_compile_config, + runtime_shape=None, + do_logging=not self.have_seen_first_graph, + use_inductor=self.compilation_configs.use_inductor) + + self.module.__dict__[target] = PiecewiseBackend( + submod, self.compilation_configs, self.graph_pool, + not self.have_seen_first_graph, sym_shape_indices, + compiled_graph_for_general_shape) + + self.have_seen_first_graph = True + compilation_counter.num_piecewise_capturable_graphs_seen += 1 + + return output + + class VllmBackend: """The compilation backend for `torch.compile` with VLLM. It is used for compilation level of `CompilationLevel.PIECEWISE`, @@ -263,8 +322,14 @@ class VllmBackend: returned_callable: Callable def __init__(self, ): - # every instance of VllmBackend has its own graph pool - self.graph_pool = torch.cuda.graph_pool_handle() + global global_graph_pool + if global_graph_pool is None: + global_graph_pool = torch.cuda.graph_pool_handle() + + # TODO: in the future, if we want to use multiple + # streams, it might not be safe to share a global pool. 
+ # only investigate this when we use multiple streams + self.graph_pool = global_graph_pool # `torch.compile` is JIT compiled, so we don't need to # do anything here @@ -286,55 +351,26 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self.split_gm, self.piecewise_graphs = split_graph( graph, self.compilation_configs.non_cudagraph_ops) - returned_callable: Callable # type: ignore + from torch._dynamo.utils import lazy_format_graph_code + logger.debug("%s", + lazy_format_graph_code("stiching module", self.split_gm)) - if len(self.piecewise_graphs) == 0: - compilation_counter.num_piecewise_graphs_seen += 1 - compilation_counter.num_piecewise_capturable_graphs_seen += 1 - returned_callable = PiecewiseBackend(graph, - self.compilation_configs, - self.graph_pool, - is_first_graph=True) - else: - from torch._dynamo.utils import lazy_format_graph_code - logger.debug( - "%s", lazy_format_graph_code("stiching module", self.split_gm)) - - is_first_graph = True - - for item in self.piecewise_graphs: - compilation_counter.num_piecewise_graphs_seen += 1 - compilation_counter.num_piecewise_capturable_graphs_seen += not item.is_splitting_graph # noqa - if not item.is_splitting_graph: - # cannot setattr to a module, so we need to set - # the attribute in the __dict__ - self.split_gm.__dict__[ - item.submod_name] = PiecewiseBackend( - item.graph, self.compilation_configs, - self.graph_pool, is_first_graph) - is_first_graph = False - returned_callable = self.split_gm - - self.returned_callable = returned_callable - # trigger the first compilation - # code borrowed from https://github.com/pytorch/pytorch/blob/4e3e08b71171fa34172b2362ff668553fac75f27/torch/_dynamo/backends/distributed.py#L206 # noqa - # to turn the inputs into fake tensors - import torch._guards - from torch._guards import detect_fake_mode - fake_mode = detect_fake_mode(example_inputs) - fake_args = [] - for arg in example_inputs: - if isinstance(arg, torch.Tensor) and not isinstance( - arg, torch._subclasses.FakeTensor): - fake_args.append( - torch._dynamo.utils.to_fake_tensor(arg, fake_mode)) - else: - fake_args.append(arg) - self.returned_callable(*fake_args) + compilation_counter.num_piecewise_graphs_seen += len( + self.piecewise_graphs) + submod_names_to_compile = [ + item.submod_name for item in self.piecewise_graphs + if not item.is_splitting_graph + ] + + # propagate the split graph to the piecewise backend, + # compile submodules with symbolic shapes + PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile, + self.compilation_configs, + self.graph_pool).run(*example_inputs) self._called = True - return self.returned_callable + return self.split_gm @dataclasses.dataclass @@ -352,11 +388,10 @@ class ConcreteSizeEntry: class PiecewiseBackend: - def __init__(self, - graph: fx.GraphModule, - compilation_configs: CompilationConfig, - graph_pool: Any, - is_first_graph: bool = False): + def __init__(self, graph: fx.GraphModule, + compilation_configs: CompilationConfig, graph_pool: Any, + is_first_graph: bool, sym_shape_indices: List[int], + compiled_graph_for_general_shape: Callable): """ The backend for piecewise compilation. It mainly handles the compilation and cudagraph capturing. 
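The runtime dispatch that PiecewiseBackend performs can be summarized in a short standalone sketch; the names below are illustrative only, and the inductor compilation and cudagraph capture done by the real backend are deliberately left out:

import torch


class ShapeDispatcher:
    """Toy stand-in for the piecewise backend: one callable compiled for a
    general (symbolic) shape, plus per-batch-size entries for the sizes we
    want to specialize. Real cudagraph capture/replay is omitted."""

    def __init__(self, general_fn, capture_sizes):
        self.general_fn = general_fn
        self.capture_sizes = set(capture_sizes)
        self.specialized = {}  # batch size -> specialized callable

    def __call__(self, x):
        batch_size = x.shape[0]
        if batch_size not in self.capture_sizes:
            # Sizes we never specialize always use the general-shape graph.
            return self.general_fn(x)
        if batch_size not in self.specialized:
            # First call with this size: record a specialized entry; this is
            # where the real backend compiles and captures a cudagraph.
            self.specialized[batch_size] = self.general_fn
        return self.specialized[batch_size](x)


runner = ShapeDispatcher(lambda x: x * 2, capture_sizes=[1, 2, 4, 8])
print(runner(torch.ones(4, 16)).shape)  # torch.Size([4, 16])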
@@ -381,12 +416,11 @@ def __init__(self, self.compilation_configs.capture_sizes ) if self.compilation_configs.use_cudagraph else set() - self.compile_finished = False self.first_run_finished = False - self.compiled_graph_for_general_shape: Callable = None # type: ignore + self.compiled_graph_for_general_shape = compiled_graph_for_general_shape # noqa - self.sym_shape_indices: List[int] = [] + self.sym_shape_indices = sym_shape_indices # the entries for different shapes that we need to either # compile or capture cudagraph @@ -399,27 +433,6 @@ def __init__(self, ) def __call__(self, *args) -> Any: - - if not self.compile_finished: - self.compile_finished = True - - # this is the first compilation, we will compile a graph with - # dynamic shape, as the caller will mark first dimension as dynamic - - self.sym_shape_indices = [ - i for i, x in enumerate(args) if isinstance(x, torch.SymInt) - ] - - self.compiled_graph_for_general_shape = wrap_inductor( - self.graph, - args, - self.compilation_configs.inductor_compile_config, - runtime_shape=None, - do_logging=self.is_first_graph, - use_inductor=self.compilation_configs.use_inductor) - - return self.graph(*args) - if not self.first_run_finished: self.first_run_finished = True return self.compiled_graph_for_general_shape(*args) From 598b6d7b070149aae5884aa8b17a0c91c93172f5 Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Fri, 1 Nov 2024 12:15:05 -0700 Subject: [PATCH 14/43] [Bugfix/Core] Flashinfer k_scale and v_scale (#9861) --- tests/kernels/test_cache.py | 21 ++++++++++++------- vllm/attention/backends/flashinfer.py | 9 +++++--- .../layers/quantization/modelopt.py | 7 +++++-- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 5b8311a33c361..e2b4778b94b9e 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -258,19 +258,20 @@ def test_reshape_and_cache_flash( del key_caches del value_caches + k_scale = key.amax().item() / 256 + v_scale = value.amax().item() / 256 + # Clone the KV caches. if kv_cache_dtype == "fp8": cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(cloned_key_cache, key_cache) + ops.convert_fp8(cloned_key_cache, key_cache, k_scale, kv_cache_dtype) cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(cloned_value_cache, value_cache) + ops.convert_fp8(cloned_value_cache, value_cache, v_scale, + kv_cache_dtype) else: cloned_key_cache = key_cache.clone() cloned_value_cache = value_cache.clone() - # Using default kv_scale - k_scale = v_scale = 1.0 - # Call the reshape_and_cache kernel. opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash, (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, @@ -281,9 +282,15 @@ def test_reshape_and_cache_flash( if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(result_key_cache, key_cache) + ops.convert_fp8(result_key_cache, + key_cache, + k_scale, + kv_dtype=kv_cache_dtype) result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(result_value_cache, value_cache) + ops.convert_fp8(result_value_cache, + value_cache, + v_scale, + kv_dtype=kv_cache_dtype) # Run the reference implementation. 
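Why the scales have to travel alongside an fp8 KV cache can be seen in a short standalone snippet; it assumes a PyTorch build that ships torch.float8_e4m3fn (2.1 or newer), and the /256 divisor only mirrors the test above rather than being a required choice:

import torch

key = torch.randn(16, 8, dtype=torch.float16) * 5.0

# Per-tensor scale chosen so the scaled values fit inside the fp8 range.
k_scale = key.abs().amax().item() / 256

# Quantize: divide by the scale, then cast down to fp8 (e4m3 here).
key_fp8 = (key / k_scale).to(torch.float8_e4m3fn)

# Dequantize: cast back up and multiply by the same scale. Without the scale
# the reconstructed values are off by a factor of k_scale, which is why the
# FlashInfer wrappers in this patch now receive k_scale and v_scale next to
# the cache.
key_restored = key_fp8.to(torch.float16) * k_scale

print((key - key_restored).abs().max())  # small quantization error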
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 234c87d5c4edb..658805d35be0a 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -759,8 +759,6 @@ def forward( v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: - assert k_scale == 1.0 and v_scale == 1.0, ( - "key/v_scale is not supported in FlashInfer.") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " @@ -874,7 +872,12 @@ def unified_flash_infer( assert prefill_meta is not None assert prefill_meta.prefill_wrapper is not None prefill_output = prefill_meta.prefill_wrapper.forward( - query, kv_cache, logits_soft_cap=logits_soft_cap, causal=True) + query, + kv_cache, + logits_soft_cap=logits_soft_cap, + causal=True, + k_scale=k_scale, + v_scale=v_scale) if decode_meta := attn_metadata.decode_metadata: assert attn_metadata.decode_metadata is not None assert attn_metadata.decode_metadata.decode_wrapper is not None diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index dc5f47eb9b0fb..9694f2b8208e2 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -141,8 +141,11 @@ def create_weights( layer.register_parameter("input_scale", scale) def process_weights_after_loading(self, layer: Module) -> None: - max_w_scale, weight = requantize_with_max_scale( - layer.weight, layer.weight_scale, layer.logical_widths) + weight = layer.weight + max_w_scale = layer.weight_scale.max() + if not (layer.weight_scale == layer.weight_scale[0]).all(): + max_w_scale, weight = requantize_with_max_scale( + layer.weight, layer.weight_scale, layer.logical_widths) layer.weight = Parameter(weight.t(), requires_grad=False) layer.weight_scale = Parameter(max_w_scale, requires_grad=False) layer.input_scale = Parameter(layer.input_scale.max(), From 18bd7587b78b3b9868fea29d59ae8c3600c3e5a5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 1 Nov 2024 13:51:57 -0700 Subject: [PATCH 15/43] [1/N] pass the complete config from engine to executor (#9933) Signed-off-by: youkaichao --- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/llm_engine.py | 50 +++++++++------------ vllm/engine/multiprocessing/engine.py | 7 +-- vllm/executor/executor_base.py | 37 ++++++---------- vllm/executor/xpu_executor.py | 44 ++++--------------- vllm/v1/engine/llm_engine.py | 62 +++++++++------------------ 6 files changed, 65 insertions(+), 137 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5198467a6ac40..6aeaf484a22b4 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -680,7 +680,7 @@ def from_engine_args( # Create the async LLM engine. 
engine = cls( - **engine_config.to_dict(), + vllm_config=engine_config, executor_class=executor_class, log_requests=not engine_args.disable_log_requests, log_stats=not engine_args.disable_log_stats, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index edef1f30a9e91..e6fe1effb8287 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -13,11 +13,8 @@ from typing_extensions import TypeIs, TypeVar import vllm.envs as envs -from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, - EngineConfig, LoadConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig, + ObservabilityConfig, ParallelConfig, SchedulerConfig) from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler, SchedulerOutputs) from vllm.engine.arg_utils import EngineArgs @@ -222,17 +219,7 @@ def validate_outputs( def __init__( self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - speculative_config: Optional[SpeculativeConfig], - decoding_config: Optional[DecodingConfig], - observability_config: Optional[ObservabilityConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], + vllm_config: EngineConfig, executor_class: Type[ExecutorBase], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, @@ -240,6 +227,22 @@ def __init__( input_registry: InputRegistry = INPUT_REGISTRY, use_cached_outputs: bool = False, ) -> None: + + # TODO: remove the local variables and use self.* throughout the class. 
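The constructor pattern introduced in this patch, a single bundled config object instead of a long list of keyword arguments, looks roughly like this in isolation; the class names below are illustrative stand-ins rather than the actual vLLM types:

from dataclasses import dataclass
from typing import Optional


@dataclass
class ConfigBundle:
    # Illustrative stand-in for EngineConfig: one object carrying every
    # sub-config the engine and its executor might need.
    model: str
    block_size: int
    lora_rank: Optional[int] = None


class ToyExecutor:

    def __init__(self, bundle: ConfigBundle) -> None:
        # Each component unpacks only the pieces it cares about, so adding a
        # new sub-config does not ripple through constructor signatures.
        self.model = bundle.model
        self.block_size = bundle.block_size


executor = ToyExecutor(ConfigBundle(model="facebook/opt-125m", block_size=16))
print(executor.model, executor.block_size)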
+ model_config = self.model_config = vllm_config.model_config + cache_config = self.cache_config = vllm_config.cache_config + lora_config = self.lora_config = vllm_config.lora_config + parallel_config = self.parallel_config = vllm_config.parallel_config + scheduler_config = self.scheduler_config = vllm_config.scheduler_config + device_config = self.device_config = vllm_config.device_config + speculative_config = self.speculative_config = vllm_config.speculative_config # noqa + load_config = self.load_config = vllm_config.load_config + decoding_config = self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa + ) + prompt_adapter_config = self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa + observability_config = self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa + ) + logger.info( "Initializing an LLM engine (v%s) with config: " "model=%r, speculative_config=%r, tokenizer=%r, " @@ -340,18 +343,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.input_processor = input_registry.create_input_processor( model_config) - self.model_executor = executor_class( - model_config=model_config, - cache_config=cache_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - lora_config=lora_config, - speculative_config=speculative_config, - load_config=load_config, - prompt_adapter_config=prompt_adapter_config, - observability_config=self.observability_config, - ) + self.model_executor = executor_class(vllm_config=vllm_config, ) if self.model_config.task != "embedding": self._initialize_kv_caches() @@ -582,7 +574,7 @@ def from_engine_args( executor_class = cls._get_executor_cls(engine_config) # Create the LLM engine. 
engine = cls( - **engine_config.to_dict(), + vllm_config=engine_config, executor_class=executor_class, log_stats=not engine_args.disable_log_stats, usage_context=usage_context, diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 0a7f430eca488..eb1512ca17822 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -7,8 +7,6 @@ import zmq from vllm import AsyncEngineArgs, SamplingParams -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) # yapf conflicts with isort for this block # yapf: disable from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, @@ -30,9 +28,6 @@ else: from vllm.engine.llm_engine import LLMEngine -CONFIG_TYPE = Union[ModelConfig, DecodingConfig, ParallelConfig, - SchedulerConfig, LoRAConfig] - logger = init_logger(__name__) POLLING_TIMEOUT_MS = 10000 @@ -130,7 +125,7 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, return cls(ipc_path=ipc_path, use_async_sockets=use_async_sockets, - **engine_config.to_dict(), + vllm_config=engine_config, executor_class=executor_class, log_requests=not engine_args.disable_log_requests, log_stats=not engine_args.disable_log_stats, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index c96cb0f2c2981..2248eecd1849f 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,10 +1,7 @@ from abc import ABC, abstractmethod from typing import List, Optional, Set, Tuple -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import EngineConfig from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest @@ -23,27 +20,19 @@ class ExecutorBase(ABC): def __init__( self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - speculative_config: Optional[SpeculativeConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], - observability_config: Optional[ObservabilityConfig], + vllm_config: EngineConfig, ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.speculative_config = speculative_config - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config self._init_executor() @abstractmethod diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 
5f78993ddc4b4..36b7e2265efab 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -2,10 +2,7 @@ import torch -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import ModelConfig, ParallelConfig from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger @@ -21,38 +18,13 @@ class XPUExecutor(GPUExecutor): uses_ray: bool = False - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], - speculative_config: Optional[SpeculativeConfig], - observability_config: Optional[ObservabilityConfig], - ) -> None: - assert device_config.device_type == "xpu" - assert (not speculative_config - ), "Speculative decoding not yet supported for XPU backend" - - model_config = _verify_and_get_model_config(model_config) - - self.model_config = model_config - self.cache_config = cache_config - self.load_config = load_config - self.lora_config = lora_config - self.parallel_config = _verify_and_get_parallel_config(parallel_config) - self.scheduler_config = scheduler_config - self.device_config = device_config - self.prompt_adapter_config = prompt_adapter_config - self.speculative_config = None - self.observability_config = observability_config - - # Instantiate the worker and load the model to GPU. - self._init_executor() + def _init_executor(self) -> None: + assert self.device_config.device_type == "xpu" + assert self.speculative_config is None, ( + "Speculative decoding not yet supported for XPU backend") + + self.model_config = _verify_and_get_model_config(self.model_config) + GPUExecutor._init_executor(self) def _get_worker_module_and_class( self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 072e52bcd686a..febabd2f31036 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -2,11 +2,8 @@ from typing import (Any, Dict, Iterable, List, Mapping, Optional, Tuple, Type, Union) -from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, - EngineConfig, LoadConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig, + ObservabilityConfig, ParallelConfig, SchedulerConfig) from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, @@ -35,17 +32,7 @@ class LLMEngine: def __init__( self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - speculative_config: Optional[SpeculativeConfig], - decoding_config: Optional[DecodingConfig], - observability_config: Optional[ObservabilityConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], + vllm_config: EngineConfig, executor_class: Type[GPUExecutor], log_stats: bool, usage_context: UsageContext = 
UsageContext.ENGINE_CONTEXT, @@ -53,6 +40,22 @@ def __init__( input_registry: InputRegistry = INPUT_REGISTRY, use_cached_outputs: bool = False, ) -> None: + + # TODO: remove the local variables and use self.* throughout the class. + model_config = self.model_config = vllm_config.model_config + cache_config = self.cache_config = vllm_config.cache_config + lora_config = self.lora_config = vllm_config.lora_config + parallel_config = self.parallel_config = vllm_config.parallel_config + scheduler_config = self.scheduler_config = vllm_config.scheduler_config + device_config = self.device_config = vllm_config.device_config + speculative_config = self.speculative_config = vllm_config.speculative_config # noqa + load_config = self.load_config = vllm_config.load_config + decoding_config = self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa + ) + prompt_adapter_config = self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa + observability_config = self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa + ) + # Override the configs for V1. # FIXME if usage_context == UsageContext.LLM_CLASS: @@ -112,18 +115,6 @@ def __init__( model_config.mm_processor_kwargs, ) - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.speculative_config = speculative_config - self.load_config = load_config - self.decoding_config = decoding_config or DecodingConfig() - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config or ObservabilityConfig( - ) self.log_stats = log_stats assert not self.model_config.skip_tokenizer_init @@ -154,18 +145,7 @@ def __init__( # Request id -> RequestOutput self.request_outputs: Dict[str, RequestOutput] = {} - self.model_executor = executor_class( - model_config=model_config, - cache_config=cache_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - lora_config=lora_config, - speculative_config=speculative_config, - load_config=load_config, - prompt_adapter_config=prompt_adapter_config, - observability_config=self.observability_config, - ) + self.model_executor = executor_class(vllm_config=vllm_config) assert self.model_config.task != "embedding" self._initialize_kv_caches() @@ -203,7 +183,7 @@ def from_engine_args( executor_class = cls._get_executor_cls(engine_config) # Create the LLM engine. 
engine = cls( - **engine_config.to_dict(), + vllm_config=engine_config, executor_class=executor_class, log_stats=not engine_args.disable_log_stats, usage_context=usage_context, From 27cd36e6e2e808464c8343066b03db5db2d15413 Mon Sep 17 00:00:00 2001 From: Gene Der Su Date: Fri, 1 Nov 2024 15:08:23 -0700 Subject: [PATCH 16/43] [Bugfix] PicklingError on RayTaskError (#9934) Signed-off-by: Gene Su --- vllm/engine/multiprocessing/engine.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index eb1512ca17822..a73b4c825b11c 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -5,6 +5,7 @@ import cloudpickle import zmq +from ray.exceptions import RayTaskError from vllm import AsyncEngineArgs, SamplingParams # yapf conflicts with isort for this block @@ -305,6 +306,11 @@ def _health_check(self): def _send_outputs(self, outputs: REQUEST_OUTPUTS_T): """Send List of RequestOutput to RPCClient.""" if outputs: + # RayTaskError might not pickelable here. We need to unpack the + # underlying exception as the real exception in the output. + if (isinstance(outputs, RPCError) + and isinstance(outputs.exception, RayTaskError)): + outputs.exception = outputs.exception.cause output_bytes = pickle.dumps(outputs) self.output_socket.send_multipart((output_bytes, ), copy=False) From d151fde8341d34592e1e5e14d2152d067421cf63 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 Nov 2024 23:04:42 +0000 Subject: [PATCH 17/43] [ci/build] Bump the patch-update group with 10 updates (#9897) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Kevin H. 
Luu --- requirements-lint.txt | 2 +- requirements-test.in | 2 +- requirements-test.txt | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements-lint.txt b/requirements-lint.txt index 07f738873e1a8..f9132bbf96437 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -1,7 +1,7 @@ # formatting yapf==0.32.0 toml==0.10.2 -tomli==2.0.1 +tomli==2.0.2 ruff==0.6.5 codespell==2.3.0 isort==5.13.2 diff --git a/requirements-test.in b/requirements-test.in index 3881f2566b556..5d44664c082a6 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -32,6 +32,6 @@ aiohttp # quantization bitsandbytes>=0.44.0 -buildkite-test-collector==0.1.8 +buildkite-test-collector==0.1.9 numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index c474c2ec34b22..7477b7c3a79cd 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -36,20 +36,20 @@ attrs==24.2.0 # referencing audioread==3.0.1 # via librosa -awscli==1.35.16 +awscli==1.35.19 # via -r requirements-test.in bitsandbytes==0.44.1 # via -r requirements-test.in black==24.10.0 # via datamodel-code-generator -boto3==1.35.50 +boto3==1.35.53 # via tensorizer -botocore==1.35.50 +botocore==1.35.53 # via # awscli # boto3 # s3transfer -buildkite-test-collector==0.1.8 +buildkite-test-collector==0.1.9 # via -r requirements-test.in certifi==2024.8.30 # via @@ -426,7 +426,7 @@ requests==2.32.3 # transformers rouge-score==0.1.2 # via lm-eval -rpds-py==0.20.0 +rpds-py==0.20.1 # via # jsonschema # referencing @@ -552,7 +552,7 @@ xxhash==3.5.0 # via # datasets # evaluate -yarl==1.17.0 +yarl==1.17.1 # via aiohttp zstandard==0.23.0 # via lm-eval From 6c0b7f548d80b5f61bfa472ad1497597c922dbc2 Mon Sep 17 00:00:00 2001 From: Peter Salas Date: Fri, 1 Nov 2024 16:21:10 -0700 Subject: [PATCH 18/43] [Core][VLM] Add precise multi-modal placeholder tracking (#8346) Signed-off-by: Peter Salas --- examples/offline_inference_audio_language.py | 6 +- tests/kernels/utils.py | 2 + .../audio_language/test_ultravox.py | 91 ++++++-- tests/multimodal/test_processor_kwargs.py | 14 +- tests/multimodal/test_utils.py | 57 ++++- tests/worker/test_model_input.py | 3 + vllm/attention/backends/abstract.py | 11 + vllm/attention/backends/blocksparse_attn.py | 3 + vllm/attention/backends/flash_attn.py | 20 ++ vllm/attention/backends/flashinfer.py | 18 ++ vllm/attention/backends/placeholder_attn.py | 22 +- vllm/attention/backends/rocm_flash_attn.py | 3 + vllm/attention/backends/utils.py | 18 ++ vllm/attention/backends/xformers.py | 3 + vllm/core/scheduler.py | 2 + vllm/inputs/__init__.py | 3 +- vllm/inputs/data.py | 11 +- vllm/inputs/registry.py | 40 ++-- vllm/model_executor/models/blip.py | 10 +- vllm/model_executor/models/blip2.py | 15 +- vllm/model_executor/models/chameleon.py | 22 +- vllm/model_executor/models/clip.py | 32 ++- vllm/model_executor/models/fuyu.py | 31 ++- vllm/model_executor/models/internvl.py | 8 +- vllm/model_executor/models/llava.py | 15 +- vllm/model_executor/models/llava_next.py | 11 +- .../model_executor/models/llava_next_video.py | 25 +- vllm/model_executor/models/llava_onevision.py | 21 +- vllm/model_executor/models/minicpmv.py | 6 +- vllm/model_executor/models/mllama.py | 7 +- vllm/model_executor/models/paligemma.py | 8 +- vllm/model_executor/models/phi3v.py | 8 +- vllm/model_executor/models/pixtral.py | 34 ++- vllm/model_executor/models/qwen.py | 10 +- vllm/model_executor/models/qwen2_audio.py | 15 +- vllm/model_executor/models/qwen2_vl.py | 11 +- vllm/model_executor/models/siglip.py | 24 +- 
vllm/model_executor/models/ultravox.py | 60 ++--- vllm/model_executor/models/utils.py | 18 +- vllm/multimodal/__init__.py | 7 +- vllm/multimodal/base.py | 214 +++++++++++++++++- vllm/multimodal/image.py | 8 +- vllm/multimodal/registry.py | 18 +- vllm/multimodal/utils.py | 21 +- vllm/multimodal/video.py | 14 +- vllm/sequence.py | 17 +- vllm/worker/cpu_model_runner.py | 38 +++- vllm/worker/enc_dec_model_runner.py | 30 +-- vllm/worker/model_runner.py | 21 +- vllm/worker/model_runner_base.py | 5 +- vllm/worker/openvino_model_runner.py | 43 +++- vllm/worker/tpu_model_runner.py | 4 + vllm/worker/xpu_model_runner.py | 38 +++- 53 files changed, 914 insertions(+), 282 deletions(-) diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 37ec667d96a77..050b791b62adb 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -34,11 +34,7 @@ def run_ultravox(question: str, audio_count: int): tokenize=False, add_generation_prompt=True) - llm = LLM(model=model_name, - enforce_eager=True, - enable_chunked_prefill=False, - max_model_len=8192, - limit_mm_per_prompt={"audio": audio_count}) + llm = LLM(model=model_name, limit_mm_per_prompt={"audio": audio_count}) stop_token_ids = None return llm, prompt, stop_token_ids diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index a2d414f636e13..c3d5252edc2a3 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -869,6 +869,7 @@ def make_test_metadata( return attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), + multi_modal_placeholder_index_maps=None, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, @@ -914,6 +915,7 @@ def make_test_metadata( return attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, + multi_modal_placeholder_index_maps=None, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index b9089e75ffab8..d14e88b4e5b26 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -2,8 +2,10 @@ import numpy as np import pytest +import pytest_asyncio from transformers import AutoModel, AutoTokenizer, BatchEncoding +from tests.utils import RemoteOpenAIServer from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE @@ -17,6 +19,13 @@ VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" HF_PLACEHOLDER = "<|audio|>" +CHUNKED_PREFILL_KWARGS = { + "enable_chunked_prefill": True, + "max_num_seqs": 2, + # Use a very small limit to exercise chunked prefill. 
+ "max_num_batched_tokens": 16 +} + @pytest.fixture(scope="session") def audio_assets(): @@ -30,6 +39,26 @@ def audio(request): return AudioAsset(request.param) +@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS)) +def server(request, audio_assets): + args = [ + "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager", + f"--limit-mm-per-prompt=audio={len(audio_assets)}" + ] + [ + f"--{key.replace('_','-')}={value}" + for key, value in request.param.items() + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + def _get_prompt(audio_count, question, placeholder): tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) placeholder = f"{placeholder}\n" * audio_count @@ -68,8 +97,7 @@ def run_test( dtype: str, max_tokens: int, num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + **kwargs, ): """Inference result should be the same between hf and vllm.""" torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -79,11 +107,8 @@ def run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + with vllm_runner(model, dtype=dtype, enforce_eager=True, + **kwargs) as vllm_model: vllm_outputs_per_audio = [ vllm_model.generate_greedy_logprobs([vllm_prompt], max_tokens, @@ -135,18 +160,16 @@ def run_multi_audio_test( dtype: str, max_tokens: int, num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + **kwargs, ): with vllm_runner(model, dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, enforce_eager=True, limit_mm_per_prompt={ "audio": max((len(audio) for _, audio in prompts_and_audios)) - }) as vllm_model: + }, + **kwargs) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( [prompt for prompt, _ in prompts_and_audios], max_tokens, @@ -162,8 +185,9 @@ def run_multi_audio_test( @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS]) def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, - num_logprobs: int) -> None: + num_logprobs: int, vllm_kwargs: dict) -> None: vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER) hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER) @@ -175,7 +199,7 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, - tensor_parallel_size=1, + **vllm_kwargs, ) @@ -183,9 +207,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS]) def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: + max_tokens: int, num_logprobs: int, + vllm_kwargs: dict) -> None: vllm_prompt = 
_get_prompt(len(audio_assets), "Describe each of the audios above.", @@ -198,5 +223,37 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, - tensor_parallel_size=1, + **vllm_kwargs, ) + + +@pytest.mark.asyncio +async def test_online_inference(client, audio_assets): + """Exercises online inference with/without chunked prefill enabled.""" + + messages = [{ + "role": + "user", + "content": [ + *[{ + "type": "audio_url", + "audio_url": { + "url": audio.url + } + } for audio in audio_assets], + { + "type": + "text", + "text": + f"What's happening in these {len(audio_assets)} audio clips?" + }, + ], + }] + + chat_completion = await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + max_tokens=10) + + assert len(chat_completion.choices) == 1 + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py index 5044740c3e734..4d3bbd805c152 100644 --- a/tests/multimodal/test_processor_kwargs.py +++ b/tests/multimodal/test_processor_kwargs.py @@ -5,8 +5,8 @@ import pytest import torch -from vllm.inputs import DecoderOnlyInputs, InputContext, token_inputs -from vllm.inputs.registry import InputRegistry +from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext, + InputRegistry, token_inputs) from vllm.multimodal import MultiModalRegistry from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData @@ -56,7 +56,7 @@ def custom_dummy_data_factory(self, num_crops=DEFAULT_NUM_CROPS): seq_data = SequenceData( array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * num_crops)) - return seq_data, None + return DummyData(seq_data, None) with patch( "vllm.inputs.registry.InputRegistry._default_dummy_data_factory", @@ -177,9 +177,9 @@ def test_dummy_data_kwarg_overrides(use_dummy_data_mock, num_crops): # NOTE: seq_len is thrown away here since this will leverage the # default dummy data factory that we have patched in, whose seq # len is solely dependent on the value of the mm_processor_kwargs. - seq_data, _ = dummy_registry.dummy_data_for_profiling( + dummy_data = dummy_registry.dummy_data_for_profiling( ctx.model_config, seq_len=-1, mm_registry=mm_registry) - assert len(seq_data.prompt_token_ids) == expected_seq_count + assert len(dummy_data.seq_data.prompt_token_ids) == expected_seq_count @pytest.mark.parametrize( @@ -206,9 +206,9 @@ def test_dummy_data_with_sad_kwarg_overrides(use_dummy_data_mock, # NOTE: seq_len is thrown away here since this will leverage the # default dummy data factory that we have patched in, whose seq # len is solely dependent on the value of the mm_processor_kwargs. 
- seq_data, _ = dummy_registry.dummy_data_for_profiling( + dummy_data = dummy_registry.dummy_data_for_profiling( ctx.model_config, seq_len=-1, mm_registry=mm_registry) - assert len(seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS + assert len(dummy_data.seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS ### Test overrides for the max token count per multimodal instance diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 38cd48629f903..69f04f0a69c0b 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -92,18 +92,50 @@ def test_repeat_and_pad_placeholder_tokens(model): tokenizer = AutoTokenizer.from_pretrained(model) test_cases = [ - ("", 2, "", [32000, 32000]), - ("", 2, "", [32000, 32000, 32000]), - ("", [3, 2], "", - [32000, 32000, 32000, 32000, 32000]), - ("Image:Image:!", [3, 2], - "Image:Image:!", - [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918]), - ("", [3, 2], "", [32000, 32000, 32000]), - ] - - for prompt, repeat_count, expected_prompt, expected_token_ids in test_cases: - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + ( + "", + 2, + "", + [32000, 32000], + [{ "offset": 0, "length": 2 }], + ), + ( + "", + 2, + "", + [32000, 32000, 32000], + [{ "offset": 0, "length": 2 }]), + ( + "", + [3, 2], + "", + [32000, 32000, 32000, 32000, 32000], + [{ "offset": 0, "length": 3 }, { "offset": 3, "length": 2 }], + ), + ( + "Image:Image:!", + [3, 2], + "Image:Image:!", + [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], + [{ "offset": 2, "length": 3 }, { "offset": 7, "length": 2 }], + ), + ( + "", + [3, 2], + "", + [32000, 32000, 32000], + [{ "offset": 0, "length": 3 }], + ), + ] # yapf: disable + + for ( + prompt, + repeat_count, + expected_prompt, + expected_token_ids, + expected_ranges, + ) in test_cases: + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer=tokenizer, prompt=prompt, prompt_token_ids=tokenizer.encode(prompt, @@ -113,3 +145,4 @@ def test_repeat_and_pad_placeholder_tokens(model): ) assert new_prompt == expected_prompt assert new_token_ids == expected_token_ids + assert ranges == expected_ranges diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index 1e7f560fc68cc..b36e8bfe73ff3 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -73,6 +73,7 @@ def test_model_runner_input(): num_prefill_tokens=2, num_decode_tokens=3, slot_mapping=torch.zeros(1), + multi_modal_placeholder_index_maps=None, ) model_input = ModelInputForGPUWithSamplingMetadata( input_tokens=torch.ones(10), @@ -124,6 +125,7 @@ def test_embedding_model_runner_input(): num_prefill_tokens=2, num_decode_tokens=3, slot_mapping=torch.zeros(1), + multi_modal_placeholder_index_maps=None, ) model_input = ModelInputForGPUWithPoolingMetadata( input_tokens=torch.ones(10), @@ -174,6 +176,7 @@ def test_multi_step_model_runner_input(): num_prefill_tokens=2, num_decode_tokens=3, slot_mapping=torch.zeros(1), + multi_modal_placeholder_index_maps=None, ) frozen_model_input = ModelInputForGPUWithSamplingMetadata( input_tokens=torch.ones(10), diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 9ea89eca01f5b..a504cb1f7e318 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -7,6 +7,8 @@ import torch +from vllm.multimodal import MultiModalPlaceholderMap + if TYPE_CHECKING: from vllm.worker.model_runner_base import (ModelRunnerBase, ModelRunnerInputBase, 
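For reference, a minimal sketch of the API change exercised by the test_repeat_and_pad_placeholder_tokens cases above: repeat_and_pad_placeholder_tokens now returns the placeholder ranges as a third value, and the model input processors later in this series forward those ranges through token_inputs. This assumes vLLM from this series is importable; the model name and the 32000 placeholder token id are illustrative stand-ins, not values taken from any file in this patch.

from vllm.inputs import token_inputs
from vllm.multimodal.utils import (cached_get_tokenizer,
                                   repeat_and_pad_placeholder_tokens)


def input_processor_for_toy_model(inputs, image_feature_size: int):
    # Illustrative tokenizer; any HF tokenizer whose vocab contains the
    # placeholder token would do.
    tokenizer = cached_get_tokenizer("llava-hf/llava-1.5-7b-hf")

    # The third return value is new: one {"offset", "length"} range per
    # expanded placeholder in the padded token sequence.
    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
        tokenizer,
        inputs.get("prompt"),
        inputs["prompt_token_ids"],
        placeholder_token_id=32000,
        repeat_count=image_feature_size,
    )

    # Forward the ranges so downstream components can locate the
    # multi-modal embeddings inside the prompt.
    return token_inputs(prompt_token_ids=new_token_ids,
                        prompt=new_prompt,
                        multi_modal_data=inputs.get("multi_modal_data"),
                        multi_modal_placeholders={"image": ranges})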
@@ -108,6 +110,15 @@ class AttentionMetadata: # in block 0, and 1st slot in block 1, respectively. slot_mapping: torch.Tensor + # The index maps that relate multi-modal embeddings to the corresponding + # placeholders. + # + # N.B. These aren't really related to attention and don't belong on this + # type -- this is just a temporary solution to make them available to + # `model_executable`. + multi_modal_placeholder_index_maps: Optional[Dict[ + str, MultiModalPlaceholderMap.IndexMap]] + @property @abstractmethod def prefill_metadata(self) -> Optional["AttentionMetadata"]: diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index c216d195c9e7e..409a42187f46c 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -215,6 +215,8 @@ def prefill_metadata( num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=self.slot_mapping[:self.num_prefill_tokens], + multi_modal_placeholder_index_maps=self. + multi_modal_placeholder_index_maps, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], max_query_len=self.max_query_len, @@ -243,6 +245,7 @@ def decode_metadata(self) -> Optional["BlocksparseFlashAttentionMetadata"]: num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=self.slot_mapping[self.num_prefill_tokens:], + multi_modal_placeholder_index_maps=None, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], max_query_len=None, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index c294fcf7f08fe..ab363ac78b028 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -1,4 +1,5 @@ """Attention layer with FlashAttention.""" +from collections import defaultdict from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type @@ -14,6 +15,7 @@ compute_slot_mapping_start_idx, is_block_tables_empty) from vllm.forward_context import get_forward_context +from vllm.multimodal import MultiModalPlaceholderMap from vllm.utils import (async_tensor_h2d, direct_register_custom_op, make_tensor_with_pad) @@ -169,6 +171,8 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=self.slot_mapping[:self.num_prefill_tokens], + multi_modal_placeholder_index_maps=self. 
+ multi_modal_placeholder_index_maps, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], max_query_len=self.max_query_len, @@ -198,6 +202,7 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=self.slot_mapping[self.num_prefill_tokens:], + multi_modal_placeholder_index_maps=None, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], max_decode_query_len=self.max_decode_query_len, @@ -297,6 +302,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.context_lens: List[int] = [] self.block_tables: List[List[int]] = [] self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -327,6 +335,12 @@ def _add_seq_group( self.context_lens.append(context_len) if is_prompt: + mm_maps = inter_data.multi_modal_placeholder_maps + if mm_maps: + for modality, placeholders in mm_maps.items(): + self.multimodal_placeholder_maps[modality].extend( + placeholders) + self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -449,6 +463,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, dtype=torch.int32, device=device) + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + self.multimodal_placeholder_maps.items() + } torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, @@ -464,6 +483,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, + multi_modal_placeholder_index_maps=placeholder_index_maps, seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, max_decode_query_len=max_decode_query_len, diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 658805d35be0a..107e3bbf79666 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,7 +1,10 @@ +from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type +from vllm.multimodal import MultiModalPlaceholderMap + try: from flashinfer import BatchDecodeWithPagedKVCacheWrapper from flashinfer.decode import CUDAGraphBatchDecodeWithPagedKVCacheWrapper @@ -215,6 +218,7 @@ def graph_capture_get_metadata_for_batch( attn_metadata = self.runner.attn_backend.make_metadata( num_prefills=0, slot_mapping=self._graph_slot_mapping[:batch_size], + multi_modal_placeholder_index_maps=None, num_prefill_tokens=0, num_decode_tokens=batch_size, max_prefill_seq_len=0, @@ -470,6 +474,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.context_lens: List[int] = [] self.block_tables: List[List[int]] = [] self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -519,6 +526,11 @@ def _add_seq_group( inter_data.curr_sliding_window_blocks): self.context_lens.append(context_len) if is_prompt: + mm_maps = inter_data.multi_modal_placeholder_maps + if mm_maps: + for 
modality, placeholders in mm_maps.items(): + self.multimodal_placeholder_maps[modality].extend( + placeholders) self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -651,6 +663,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, dtype=torch.int32, device=device) + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + self.multimodal_placeholder_maps.items() + } torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, @@ -694,6 +711,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], decode_query_len=decode_query_len, num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, + multi_modal_placeholder_index_maps=placeholder_index_maps, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, max_prefill_seq_len=max_prefill_seq_len, diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 4116fbf00020c..888adbffb8578 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -1,5 +1,6 @@ +from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type import torch @@ -7,6 +8,7 @@ AttentionMetadata, AttentionMetadataBuilder) from vllm.attention.backends.utils import CommonAttentionState +from vllm.multimodal import MultiModalPlaceholderMap if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUBuilder @@ -135,6 +137,8 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=self. 
+ multi_modal_placeholder_index_maps, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], max_decode_query_len=0, @@ -167,6 +171,7 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], max_decode_query_len=self.max_decode_query_len, @@ -189,6 +194,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -213,6 +221,12 @@ def _add_seq_group( self.context_lens.append(context_len) if is_prompt: + mm_maps = inter_data.multi_modal_placeholder_maps + if mm_maps: + for modality, placeholders in mm_maps.items(): + self.multimodal_placeholder_maps[modality].extend( + placeholders) + self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -280,6 +294,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, dtype=torch.int32, device=device) + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + self.multimodal_placeholder_maps.items() + } torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, @@ -296,6 +315,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], return PlaceholderAttentionMetadata( num_prefills=self.num_prefills, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=placeholder_index_maps, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 30859dfa60634..b129d0d992f2f 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -150,6 +150,8 @@ def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=self.slot_mapping[:self.num_prefill_tokens], + multi_modal_placeholder_index_maps=self. 
+ multi_modal_placeholder_index_maps, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], max_query_len=self.max_query_len, @@ -178,6 +180,7 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=self.slot_mapping[self.num_prefill_tokens:], + multi_modal_placeholder_index_maps=None, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], max_query_len=None, diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 32fccd0dfb496..55293bbb06e1d 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -1,4 +1,5 @@ """Attention backend utils""" +from collections import defaultdict from contextlib import contextmanager from typing import TYPE_CHECKING, Any, Dict, List, Type, TypeVar, Union @@ -7,6 +8,7 @@ from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder, AttentionState) +from vllm.multimodal import MultiModalPlaceholderMap from vllm.utils import async_tensor_h2d, make_tensor_with_pad if TYPE_CHECKING: @@ -123,6 +125,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.context_lens: List[int] = [] self.block_tables: List[List[int]] = [] self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -147,6 +152,12 @@ def _add_seq_group( inter_data.curr_sliding_window_blocks): self.context_lens.append(context_len) if is_prompt: + mm_maps = inter_data.multi_modal_placeholder_maps + if mm_maps: + for modality, placeholders in mm_maps.items(): + self.multimodal_placeholder_maps[modality].extend( + placeholders) + self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -242,6 +253,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, dtype=torch.int32, device=device) + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + self.multimodal_placeholder_maps.items() + } torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, @@ -254,6 +270,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], return self._metadata_cls( # type: ignore num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, + multi_modal_placeholder_index_maps=placeholder_index_maps, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, @@ -305,6 +322,7 @@ def graph_capture_get_metadata_for_batch( num_prefill_tokens=0, num_decode_tokens=batch_size, slot_mapping=self._graph_slot_mapping[:batch_size], + multi_modal_placeholder_index_maps=None, seq_lens=None, seq_lens_tensor=self._graph_seq_lens[:batch_size], max_query_len=1, diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 5aaf13d8ea744..21877f2dded0e 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -212,6 +212,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=self. 
+ multi_modal_placeholder_index_maps, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_query_len=self.max_query_len, @@ -255,6 +257,7 @@ def decode_metadata(self) -> Optional["XFormersMetadata"]: num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, seq_lens_tensor=seq_lens_tensor, max_prefill_seq_len=0, max_decode_seq_len=self.max_decode_seq_len, diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index e35c05f4fe7f7..e56d5cddce424 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1308,6 +1308,8 @@ def schedule( # `multi_modal_data` will be None. multi_modal_data=seq_group.multi_modal_data if scheduler_outputs.num_prefill_groups > 0 else None, + multi_modal_placeholders=seq_group.multi_modal_placeholders + if scheduler_outputs.num_prefill_groups > 0 else None, mm_processor_kwargs=seq_group.mm_processor_kwargs, prompt_adapter_request=seq_group.prompt_adapter_request, ) diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 7b73922ddd2c5..ac7b3ca28b406 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -3,7 +3,7 @@ SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt, build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) -from .registry import InputContext, InputRegistry +from .registry import DummyData, InputContext, InputRegistry INPUT_REGISTRY = InputRegistry() """ @@ -29,6 +29,7 @@ "to_enc_dec_tuple_list", "zip_enc_dec_prompts", "INPUT_REGISTRY", + "DummyData", "InputContext", "InputRegistry", ] diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 9a094191eda38..ba393cbcce4eb 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -4,7 +4,7 @@ from typing_extensions import NotRequired, TypedDict, TypeVar if TYPE_CHECKING: - from vllm.multimodal import MultiModalDataDict + from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict class TextPrompt(TypedDict): @@ -136,6 +136,12 @@ class TokenInputs(TypedDict): if the model supports it. """ + multi_modal_placeholders: NotRequired[ + Optional["MultiModalPlaceholderDict"]] + """ + Placeholder ranges for the multi-modal data. 
+ """ + mm_processor_kwargs: NotRequired[Optional[Dict[str, Any]]] """ Optional multi-modal processor kwargs to be forwarded to the @@ -149,6 +155,7 @@ def token_inputs( prompt_token_ids: List[int], prompt: Optional[str] = None, multi_modal_data: Optional["MultiModalDataDict"] = None, + multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, ) -> TokenInputs: """Construct :class:`TokenInputs` from optional values.""" @@ -158,6 +165,8 @@ def token_inputs( inputs["prompt"] = prompt if multi_modal_data is not None: inputs["multi_modal_data"] = multi_modal_data + if multi_modal_placeholders is not None: + inputs["multi_modal_placeholders"] = multi_modal_placeholders if mm_processor_kwargs is not None: inputs["mm_processor_kwargs"] = mm_processor_kwargs diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 4cebc91ce715c..fbf912a212568 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -1,8 +1,8 @@ import functools from collections import UserDict from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, - Protocol, Tuple, Type) +from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, NamedTuple, + Optional, Protocol, Type) from torch import nn from transformers import PretrainedConfig @@ -16,7 +16,8 @@ if TYPE_CHECKING: from vllm.config import ModelConfig - from vllm.multimodal import MultiModalDataDict, MultiModalRegistry + from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict, + MultiModalRegistry) from vllm.sequence import SequenceData logger = init_logger(__name__) @@ -63,6 +64,14 @@ def get_hf_image_processor_config(self) -> Dict[str, Any]: N = TypeVar("N", bound=Type[nn.Module]) +class DummyData(NamedTuple): + """Dummy data used for profiling.""" + + seq_data: "SequenceData" + multi_modal_data: Optional["MultiModalDataDict"] = None + multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None + + class DummyDataFactory(Protocol): def __call__( @@ -71,7 +80,7 @@ def __call__( seq_len: int, mm_counts: Mapping[str, int], **mm_processor_kwargs: Any, - ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]: + ) -> DummyData: """ Create dummy data to be inputted into the model. @@ -123,7 +132,7 @@ def _default_dummy_data_factory( ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int], - ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]: + ) -> DummyData: """ The default dummy data factory represents the longest possible text that can be inputted to the model. @@ -134,10 +143,7 @@ def _default_dummy_data_factory( # Avoid circular import from vllm.sequence import SequenceData - dummy_seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) - dummy_multi_modal_data = None - - return dummy_seq_data, dummy_multi_modal_data + return DummyData(SequenceData.from_prompt_token_counts((0, seq_len))) def register_dummy_data(self, factory: DummyDataFactory): """ @@ -195,7 +201,7 @@ def dummy_data_for_profiling( seq_len: int, mm_registry: "MultiModalRegistry", is_encoder_data: bool = False, - ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]: + ) -> DummyData: """ Create dummy data for profiling the memory usage of a model. 
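To make the new return type concrete, here is a minimal sketch of a dummy-data factory written against the DummyData named tuple and DummyDataFactory protocol defined above, assuming vLLM from this series is importable. The per-image feature size of 16, the 32000 placeholder token id, and the 224x224 dummy image are illustrative assumptions, not values taken from any model in this patch.

from typing import Mapping

from PIL import Image

from vllm.inputs import DummyData, InputContext
from vllm.multimodal.utils import consecutive_placeholder_ranges
from vllm.sequence import SequenceData


def dummy_data_for_toy_model(ctx: InputContext, seq_len: int,
                             mm_counts: Mapping[str, int]) -> DummyData:
    num_images = mm_counts["image"]
    image_feature_size = 16  # assumed tokens per image, for illustration only

    # Image placeholder tokens first, then padding tokens up to seq_len.
    seq_data = SequenceData.from_prompt_token_counts(
        (32000, image_feature_size * num_images),
        (0, seq_len - image_feature_size * num_images),
    )
    mm_data = {"image": [Image.new("RGB", (224, 224), color=0)] * num_images}
    ranges = {
        "image":
        consecutive_placeholder_ranges(num_items=num_images,
                                       item_size=image_feature_size)
    }

    # The pre-existing factories returned a (seq_data, mm_data) tuple;
    # with this series they return a DummyData instead.
    return DummyData(seq_data, mm_data, ranges)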
@@ -220,12 +226,12 @@ def dummy_data_for_profiling( mm_processor_kwargs = get_allowed_kwarg_only_overrides( dummy_factory, overrides=model_config.mm_processor_kwargs) - seq_data, mm_data = dummy_factory(InputContext(model_config), seq_len, - _MultiModalCounts(mm_counts), - **mm_processor_kwargs) + dummy_data = dummy_factory(InputContext(model_config), seq_len, + _MultiModalCounts(mm_counts), + **mm_processor_kwargs) # Having more tokens is over-conservative but otherwise fine - num_tokens = seq_data.prompt_token_ids + num_tokens = dummy_data.seq_data.prompt_token_ids if len(num_tokens) < seq_len: if is_encoder_data: print_warning_once( @@ -235,15 +241,15 @@ def dummy_data_for_profiling( raise AssertionError( f"Expected at least {seq_len} dummy tokens for profiling, " f"but found {len(num_tokens)} tokens instead.") - if mm_data is not None: - for k, v in mm_data.items(): + if dummy_data.multi_modal_data is not None: + for k, v in dummy_data.multi_modal_data.items(): num_items = len(v) if isinstance(v, list) else 1 num_expected = mm_counts[k] assert num_items >= num_expected, ( f"Expected at least {num_expected} dummy '{k}' instances " f"for profiling, but found {num_items} instances instead.") - return seq_data, mm_data + return dummy_data def _default_input_processor( self, diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 1f2d7384076ed..e612010677364 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -98,6 +98,11 @@ def input_processor_for_blip( if multi_modal_data is None or "image" not in multi_modal_data: return inputs + if "multi_modal_placeholders" in inputs and "image" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. + return inputs + tokenizer = cached_get_tokenizer(model_config.tokenizer) if image_feature_size_override is None: @@ -105,7 +110,7 @@ def input_processor_for_blip( else: image_feature_size = image_feature_size_override - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -116,7 +121,8 @@ def input_processor_for_blip( # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": ranges}) # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index c3b3cc8a4ddb6..db1f92649bd49 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -9,13 +9,14 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import 
IntermediateTensors, SequenceData from .blip import (BlipVisionModel, dummy_image_for_blip, @@ -425,7 +426,11 @@ def dummy_seq_data_for_blip2( return SequenceData.from_prompt_token_counts( (image_token_id, image_feature_size * num_images), (0, seq_len - image_feature_size * num_images), - ) + ), { + "image": + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_data_for_blip2(ctx: InputContext, seq_len: int, @@ -434,7 +439,7 @@ def dummy_data_for_blip2(ctx: InputContext, seq_len: int, vision_config = hf_config.vision_config num_images = mm_counts["image"] - seq_data = dummy_seq_data_for_blip2( + seq_data, ranges = dummy_seq_data_for_blip2( hf_config, seq_len, num_images, @@ -444,7 +449,7 @@ def dummy_data_for_blip2(ctx: InputContext, seq_len: int, if isinstance(vision_config, Blip2VisionConfig): mm_data = dummy_image_for_blip(vision_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index aaf559ca386cc..9f6c6786c0fa4 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -11,8 +11,8 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -30,6 +30,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors, SequenceData from vllm.utils import print_warning_once @@ -73,7 +74,11 @@ def dummy_seq_data_for_chameleon( return SequenceData.from_prompt_token_counts( (image_token_id, image_feature_size * num_images), (0, seq_len - image_feature_size * num_images), - ) + ), { + "image": + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_image_for_chameleon( @@ -97,14 +102,14 @@ def dummy_data_for_chameleon(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): num_images = mm_counts["image"] - seq_data = dummy_seq_data_for_chameleon( + seq_data, ranges = dummy_seq_data_for_chameleon( seq_len, num_images, image_token_id=CHAMELEON_IMAGE_TOKEN_ID, ) mm_data = dummy_image_for_chameleon(num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) def input_processor_for_chameleon(ctx: InputContext, @@ -120,9 +125,14 @@ def input_processor_for_chameleon(ctx: InputContext, if multi_modal_data is None or "image" not in multi_modal_data: return inputs + if "multi_modal_placeholders" in inputs and "image" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. 
+ return inputs + model_config = ctx.model_config tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index a3293020c042e..2d81b9266826b 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData @@ -49,14 +50,13 @@ def get_max_clip_image_tokens(hf_config: CLIPVisionConfig) -> int: return get_clip_image_feature_size(hf_config) -def dummy_seq_data_for_clip( - hf_config: CLIPVisionConfig, - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): +def dummy_seq_data_for_clip(hf_config: CLIPVisionConfig, + seq_len: int, + num_images: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, + mm_key: str = "image"): if image_feature_size_override is None: image_feature_size = get_clip_image_feature_size(hf_config) else: @@ -65,7 +65,11 @@ def dummy_seq_data_for_clip( return SequenceData.from_prompt_token_counts( (image_token_id, image_feature_size * num_images), (0, seq_len - image_feature_size * num_images), - ) + ), { + mm_key: + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_image_for_clip( @@ -117,6 +121,11 @@ def input_processor_for_clip( if multi_modal_data is None or "image" not in multi_modal_data: return inputs + if "multi_modal_placeholders" in inputs and "image" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. 
+ return inputs + tokenizer = cached_get_tokenizer(model_config.tokenizer) if image_feature_size_override is None: @@ -130,7 +139,7 @@ def input_processor_for_clip( else: image_feature_size = image_feature_size_override - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -141,7 +150,8 @@ def input_processor_for_clip( # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": ranges}) # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 358d1dd288c49..0de590d1d8372 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -27,8 +27,8 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput @@ -37,9 +37,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges) from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) +from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings @@ -103,7 +105,11 @@ def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int): token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * (seq_len - image_feature_size * num_images) - return SequenceData(token_ids) + return SequenceData(token_ids), { + "image": + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_image_for_fuyu( @@ -119,15 +125,15 @@ def dummy_image_for_fuyu( def dummy_data_for_fuyu(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): num_images = mm_counts["image"] - seq_data = dummy_seq_data_for_fuyu(ctx, seq_len, num_images) + seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images) mm_data = dummy_image_for_fuyu(num_images, image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) def _fuyu_image_preprocess(image_processor: FuyuImageProcessor, - data: Image.Image): + data: List[Image.Image]): image_encoding = image_processor.preprocess(data, return_tensors="pt") batch_images = torch.stack([img[0] for img in image_encoding["images"] ]).unsqueeze(1) @@ -158,8 +164,10 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): model_config = ctx.model_config image_data = 
multi_modal_data["image"] new_multi_modal_data = {} + image_list = image_data if isinstance(image_data, list) else [image_data] + # process image data - if isinstance(image_data, Image.Image): + if is_list_of(image_list, Image.Image): # Fuyu's image_processor can also finish token padding image_processor: FuyuImageProcessor = cached_get_image_processor( model_config.model) @@ -171,7 +179,7 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): ]) new_multi_modal_data["image"] = image_patches - elif isinstance(image_data, torch.Tensor): + elif is_list_of(image_list, torch.Tensor): raise NotImplementedError("Embeddings input is not supported yet") else: raise TypeError(f"Invalid image type: {type(image_data)}") @@ -198,12 +206,13 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): def input_mapper_for_fuyu(ctx: InputContext, data: object): model_config = ctx.model_config - if isinstance(data, Image.Image): + data_list = data if isinstance(data, list) else [data] + if is_list_of(data_list, Image.Image): # Fuyu's image_processor can also finish token padding image_processor: FuyuImageProcessor = cached_get_image_processor( model_config.model) - model_image_input = _fuyu_image_preprocess(image_processor, data) + model_image_input = _fuyu_image_preprocess(image_processor, data_list) data = torch.stack([ image_patch[0] for image_patch in model_image_input["image_patches"] diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 1c1fde5b30983..d2ec0ff6e74c6 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -17,8 +17,8 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.quantization import (AWQConfig, QuantizationConfig) from vllm.model_executor.layers.sampler import Sampler, SamplerOutput @@ -379,7 +379,7 @@ def dummy_data( model_config.tokenizer, trust_remote_code=model_config.trust_remote_code) - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( hf_config.vision_config, seq_len, num_images, @@ -398,7 +398,7 @@ def dummy_data( image_height_override=max_image_height, ) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) input_pipeline = InternVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 27055e7ced865..7fbd59ebd98fd 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -10,7 +10,8 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput @@ -111,7 +112,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int, image_feature_size = get_max_llava_image_tokens(ctx) if isinstance(vision_config, CLIPVisionConfig): - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( 
vision_config, seq_len, num_images, @@ -120,9 +121,9 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int, ) mm_data = dummy_image_for_clip(vision_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) elif isinstance(vision_config, SiglipVisionConfig): - seq_data = dummy_seq_data_for_siglip( + seq_data, ranges = dummy_seq_data_for_siglip( vision_config, seq_len, num_images, @@ -131,9 +132,9 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int, ) mm_data = dummy_image_for_siglip(vision_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) elif isinstance(vision_config, PixtralVisionConfig): - seq_data = dummy_seq_data_for_pixtral_hf( + seq_data, ranges = dummy_seq_data_for_pixtral_hf( vision_config, seq_len, num_images, @@ -142,7 +143,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int, ) mm_data = dummy_image_for_pixtral_hf(vision_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index e8540d85ff565..e8c5786066170 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -12,7 +12,8 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig, PoolerConfig -from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext) from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput @@ -180,7 +181,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int, max_feat_height, max_feat_width = pinpoint if isinstance(vision_config, CLIPVisionConfig): - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( vision_config, seq_len, num_images, @@ -195,9 +196,9 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int, image_height_override=max_feat_height, ) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) elif isinstance(vision_config, SiglipVisionConfig): - seq_data = dummy_seq_data_for_siglip( + seq_data, ranges = dummy_seq_data_for_siglip( vision_config, seq_len, num_images, @@ -212,7 +213,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int, image_height_override=max_feat_height, ) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index b8051d5fc6ae2..b755e2347f6ed 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -11,8 +11,8 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler 
import Sampler, SamplerOutput @@ -108,33 +108,35 @@ def dummy_data_for_llava_next_video(ctx: InputContext, seq_len: int, video_feature_size = frames_per_video * tokens_per_frame if isinstance(vision_config, CLIPVisionConfig): - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( vision_config, seq_len, num_videos, image_token_id=hf_config.video_token_index, image_feature_size_override=video_feature_size, + mm_key="video", ) pil_frame = dummy_image_for_clip(vision_config, num_images=1) np_frame = np.array(pil_frame["image"]) mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) mm_data = {"video": mm_data_per_video} - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) elif isinstance(vision_config, SiglipVisionConfig): - seq_data = dummy_seq_data_for_siglip( + seq_data, ranges = dummy_seq_data_for_siglip( vision_config, seq_len, num_videos, image_token_id=hf_config.video_token_index, image_feature_size_override=video_feature_size, + mm_key="video", ) pil_frame = dummy_image_for_siglip(vision_config, num_images=1) np_frame = np.array(pil_frame["image"]) mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) mm_data = {"video": mm_data_per_video} - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -145,6 +147,12 @@ def input_processor_for_llava_next_video(ctx: InputContext, multi_modal_data = inputs.get("multi_modal_data") if multi_modal_data is None or "video" not in multi_modal_data: return inputs + + if "multi_modal_placeholders" in inputs and "video" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. + return inputs + video_data = multi_modal_data["video"] model_config = ctx.model_config @@ -160,7 +168,7 @@ def input_processor_for_llava_next_video(ctx: InputContext, tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -170,7 +178,8 @@ def input_processor_for_llava_next_video(ctx: InputContext, return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"video": ranges}) elif is_list_of(video_data, np.ndarray): raise NotImplementedError( diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index a0cf208a65f36..f410d64577a77 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -15,8 +15,8 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput @@ -218,31 +218,31 @@ def dummy_data_for_llava_onevision(ctx: InputContext, seq_len: int, video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) if isinstance(vision_config, CLIPVisionConfig): - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = 
dummy_seq_data_for_clip( vision_config, seq_len, num_videos, image_token_id=hf_config.video_token_index, image_feature_size_override=video_feature_size, - ) + mm_key="video") mm_data = dummy_video_for_clip(vision_config, num_frames=num_frames, num_videos=num_videos) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) elif isinstance(vision_config, SiglipVisionConfig): - seq_data = dummy_seq_data_for_siglip( + seq_data, ranges = dummy_seq_data_for_siglip( vision_config, seq_len, num_videos, image_token_id=hf_config.video_token_index, image_feature_size_override=video_feature_size, - ) + mm_key="video") mm_data = dummy_video_for_siglip(vision_config, num_frames=num_frames, num_videos=num_videos) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -320,7 +320,7 @@ def input_processor_when_multimodal_input_video(ctx: InputContext, video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -330,7 +330,8 @@ def input_processor_when_multimodal_input_video(ctx: InputContext, return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"video": ranges}) elif is_list_of(video_data, np.ndarray): video_feature_size = [] diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 4917c33136069..a526a5dccd398 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -36,8 +36,8 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, @@ -277,7 +277,7 @@ def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int, seq_data = dummy_seq_data_for_minicpmv(seq_len, num_images) mm_data = dummy_image_for_minicpmv(ctx, hf_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data) def input_processor_for_minicpmv(ctx: InputContext, inputs: DecoderOnlyInputs): diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 5cf5272cae878..19c3827e43703 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -36,7 +36,7 @@ from vllm.attention.ops.paged_attn import PagedAttention from vllm.config import CacheConfig, MultiModalConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, EncoderDecoderInputs, InputContext) from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm @@ -176,13 +176,14 @@ def dummy_image(num_images: int, ): def dummy_decoder_data_for_mllama(ctx: InputContext, seq_len: 
int, mm_counts: Mapping[str, int]): num_images = mm_counts["image"] - return dummy_decoder_seq_data(seq_len, num_images), None + return DummyData(dummy_decoder_seq_data(seq_len, num_images)) def dummy_encoder_data_for_mllama(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): num_images = mm_counts["image"] - return dummy_encoder_seq_data(ctx, num_images), dummy_image(num_images) + return DummyData(dummy_encoder_seq_data(ctx, num_images), + dummy_image(num_images)) def _prepare_aspect_ratio_attention_mask( diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 8e29c6079b994..4b6061e113cb2 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -7,8 +7,8 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput @@ -58,7 +58,7 @@ def dummy_data_for_paligemma(ctx: InputContext, seq_len: int, vision_config = hf_config.vision_config num_images = mm_counts["image"] - seq_data = dummy_seq_data_for_siglip( + seq_data, ranges = dummy_seq_data_for_siglip( vision_config, seq_len, num_images, @@ -66,7 +66,7 @@ def dummy_data_for_paligemma(ctx: InputContext, seq_len: int, ) mm_data = dummy_image_for_siglip(vision_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) def input_processor_for_paligemma(ctx: InputContext, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4928e447d5b9e..5b477a8ed5f49 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -28,8 +28,8 @@ from vllm.attention import AttentionMetadata from vllm.config import (CacheConfig, ModelConfig, MultiModalConfig, PoolerConfig) -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig @@ -380,7 +380,7 @@ def dummy_data_for_phi3v(ctx: InputContext, image_feature_size = get_max_phi3v_image_tokens(ctx, num_crops=num_crops) - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( CLIP_VIT_LARGE_PATCH14_336_CONFIG, seq_len, num_images, @@ -394,7 +394,7 @@ def dummy_data_for_phi3v(ctx: InputContext, image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, ) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) @lru_cache diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 6b53bf5660096..051454c49bff8 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -17,8 +17,8 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, ModelConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from 
vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig @@ -28,7 +28,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.base import MultiModalInputs -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import is_list_of @@ -81,7 +82,12 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, ) mm_data = {"image": num_images * [image]} - return seq_data, mm_data + mm_placeholders = { + "image": + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } + return DummyData(seq_data, mm_data, mm_placeholders) def input_mapper_for_pixtral(ctx: InputContext, @@ -630,13 +636,13 @@ def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int: def dummy_seq_data_for_pixtral_hf( - hf_config: PixtralVisionConfig, - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): + hf_config: PixtralVisionConfig, + seq_len: int, + num_images: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, + mm_key: str = "image"): if image_feature_size_override is None: image_feature_size = get_max_pixtral_hf_image_feature_size(hf_config) else: @@ -645,7 +651,11 @@ def dummy_seq_data_for_pixtral_hf( return SequenceData.from_prompt_token_counts( (image_token_id, image_feature_size * num_images), (0, seq_len - image_feature_size * num_images), - ) + ), { + mm_key: + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_image_for_pixtral_hf( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 61665768eacf5..b2b5c70182135 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -23,8 +23,8 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm @@ -810,7 +810,7 @@ def dummy_data_for_qwen( ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int], -) -> Tuple[SequenceData, Optional[Dict]]: +) -> DummyData: """Build dummy data for warming up Qwen models; this will only contain text matching the defaults for VLLM unless the model has a visual config. 
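As an aside on the ranges that these dummy-data hunks attach: the list produced by consecutive_placeholder_ranges is, judging from how it is used here, a sequence of back-to-back offset/length pairs. The stand-in below is an assumption about the shape of that output, inferred from the test expectations earlier in this patch, and is not a copy of the real helper in vllm.multimodal.utils.

from typing import Dict, List


def consecutive_ranges_sketch(num_items: int,
                              item_size: int) -> List[Dict[str, int]]:
    # One range per item, laid out contiguously starting at offset 0.
    return [{
        "offset": i * item_size,
        "length": item_size
    } for i in range(num_items)]


# e.g. three items of 16 placeholder tokens each:
print(consecutive_ranges_sketch(3, 16))
# [{'offset': 0, 'length': 16}, {'offset': 16, 'length': 16},
#  {'offset': 32, 'length': 16}]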
@@ -829,7 +829,7 @@ def dummy_data_for_qwen( if not hasattr(hf_config, "visual"): seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) mm_data = None - return seq_data, mm_data + return DummyData(seq_data, mm_data) # We have a visual component - use images to warm up num_images = mm_counts["image"] @@ -861,7 +861,7 @@ def dummy_data_for_qwen( # the data will get resized and the # of tokens per image is constant image = Image.new("RGB", (224, 224), color=0) mm_data = {"image": image if num_images == 1 else [image] * num_images} - return seq_data, mm_data + return DummyData(seq_data, mm_data) class QWenBaseModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 3d049eeb920b7..6114548bda42c 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -31,8 +31,8 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( @@ -44,6 +44,7 @@ from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData from .interfaces import SupportsMultiModal, SupportsPP @@ -85,7 +86,8 @@ def forward(self, audio_features): def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): num_audios = mm_counts["audio"] - max_llm_audio_tokens = get_max_qwen2_audio_audio_tokens(ctx) * num_audios + max_tokens_per_audio = get_max_qwen2_audio_audio_tokens(ctx) + max_llm_audio_tokens = max_tokens_per_audio * num_audios if seq_len - max_llm_audio_tokens - 2 < 0: raise RuntimeError( f"Qwen2-Audio cannot process {num_audios} audios in a prompt, " @@ -99,7 +101,12 @@ def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int, (0, seq_len - max_llm_audio_tokens), ) dummy_audio = np.full((max_llm_audio_tokens * 2 * 2 * 160, ), 0.) 
- return dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios} + return DummyData( + dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios}, { + "audio": + consecutive_placeholder_ranges(num_items=num_audios, + item_size=max_tokens_per_audio) + }) def get_processor( diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 1e12c2332b65e..d801903f8f9fe 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -44,8 +44,8 @@ from vllm.config import CacheConfig, MultiModalConfig from vllm.distributed import get_pp_group, parallel_state from vllm.distributed import utils as dist_utils -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU @@ -744,9 +744,10 @@ def dummy_data_for_qwen2_vl( dummy_image = Image.new("RGB", (max_resized_width, max_resized_height), color=0) - return dummy_seqdata, { - "image": dummy_image if num_images == 1 else [dummy_image] * num_images - } + return DummyData(dummy_seqdata, { + "image": + dummy_image if num_images == 1 else [dummy_image] * num_images + }) def _get_llm_num_vision_tokens( diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 2e7ae32055aaf..acaf4afdecfe5 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -23,6 +23,7 @@ VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData @@ -61,6 +62,7 @@ def dummy_seq_data_for_siglip( *, image_token_id: int, image_feature_size_override: Optional[int] = None, + mm_key: str = "image", ): if image_feature_size_override is None: image_feature_size = get_siglip_image_feature_size(hf_config) @@ -70,7 +72,11 @@ def dummy_seq_data_for_siglip( return SequenceData.from_prompt_token_counts( (image_token_id, image_feature_size * num_images), (0, seq_len - image_feature_size * num_images), - ) + ), { + mm_key: + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_image_for_siglip( @@ -122,6 +128,11 @@ def input_processor_for_siglip( if multi_modal_data is None or "image" not in multi_modal_data: return inputs + if "multi_modal_placeholders" in inputs and "image" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. 
+ return inputs + tokenizer = cached_get_tokenizer(model_config.tokenizer) if image_feature_size_override is None: @@ -135,7 +146,7 @@ def input_processor_for_siglip( else: image_feature_size = image_feature_size_override - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -144,11 +155,10 @@ def input_processor_for_siglip( ) # NOTE: Create a defensive copy of the original inputs - return token_inputs( - prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - ) + return token_inputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": ranges}) # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index f08e4aa355086..749750fc9c16e 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -2,7 +2,6 @@ """PyTorch Ultravox model.""" import math -from array import array from functools import cached_property, lru_cache from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union, cast) @@ -17,27 +16,27 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import INPUT_REGISTRY -from vllm.inputs.data import DecoderOnlyInputs, token_inputs -from vllm.inputs.registry import InputContext +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs, NestedTensors +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, + NestedTensors) from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) +from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.configs.ultravox import UltravoxConfig from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, merge_multimodal_embeddings) + init_vllm_registered_model, + merge_multimodal_embeddings_from_map) _AUDIO_PLACEHOLDER_TOKEN = 128002 _AUDIO_TOKENS_PER_SECOND = 6.25 @@ -46,13 +45,13 @@ class UltravoxAudioFeatureInputs(TypedDict): type: Literal["audio_features"] data: NestedTensors - """Shape: `(batch_size, num_audios, 80, M)""" + """Shape: `(batch_size, num_audios, 80, M)`""" class UltravoxAudioEmbeddingInputs(TypedDict): type: Literal["audio_embeds"] data: NestedTensors - """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)""" + """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)`""" UltravoxAudioInputs = 
Union[UltravoxAudioFeatureInputs, @@ -79,17 +78,16 @@ def dummy_seq_data_for_ultravox( seq_len: int, audio_count: int, ): - audio_placeholder = array( - VLLM_TOKEN_ID_ARRAY_TYPE, - [_AUDIO_PLACEHOLDER_TOKEN]) * get_ultravox_max_audio_tokens(ctx) + audio_length = min(get_ultravox_max_audio_tokens(ctx), + seq_len // audio_count) - # Add a separator between each chunk. - audio_token_ids = (audio_placeholder + - array(VLLM_TOKEN_ID_ARRAY_TYPE, [0])) * audio_count - other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0]) * (seq_len - len(audio_token_ids)) - - return SequenceData(audio_token_ids + other_token_ids) + return SequenceData.from_prompt_token_counts( + (_AUDIO_PLACEHOLDER_TOKEN, audio_length * audio_count), + (0, seq_len - audio_length * audio_count)), { + "audio": + consecutive_placeholder_ranges(num_items=audio_count, + item_size=audio_length) + } def dummy_audio_for_ultravox( @@ -107,10 +105,10 @@ def dummy_data_for_ultravox( mm_counts: Mapping[str, int], ): audio_count = mm_counts["audio"] - seq_data = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count) + seq_data, ranges = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count) mm_dict = dummy_audio_for_ultravox(ctx, audio_count) - return (seq_data, mm_dict) + return DummyData(seq_data, mm_dict, ranges) def input_mapper_for_ultravox(ctx: InputContext, data: object): @@ -164,6 +162,11 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): if multi_modal_data is None or "audio" not in multi_modal_data: return inputs + if "multi_modal_placeholders" in inputs and "audio" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. + return inputs + feature_extractor = whisper_feature_extractor(ctx) audios = multi_modal_data["audio"] if not isinstance(audios, list): @@ -197,7 +200,7 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -208,7 +211,8 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"audio": ranges}) class StackAudioFrames(nn.Module): @@ -472,9 +476,9 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, inputs_embeds = self.language_model.model.get_input_embeddings( input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, audio_embeddings, - _AUDIO_PLACEHOLDER_TOKEN) + merge_multimodal_embeddings_from_map( + inputs_embeds, audio_embeddings, + attn_metadata.multi_modal_placeholder_index_maps["audio"]) input_ids = None else: inputs_embeds = None diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 0aecb5d151a45..c6ec1769fc5d1 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -18,7 +18,7 @@ from vllm.model_executor.model_loader.loader import build_model from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models import ModelRegistry -from vllm.multimodal.base import NestedTensors +from vllm.multimodal.base import MultiModalPlaceholderMap, 
NestedTensors from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import is_pin_memory_available @@ -326,6 +326,22 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str: _embedding_count_expression(inner) for inner in embeddings) +def merge_multimodal_embeddings_from_map( + inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, + placeholder_map: MultiModalPlaceholderMap.IndexMap) -> torch.Tensor: + """ + Merge ``multimodal_embeddings`` into ``inputs_embeds`` using the provided + placeholder map . + + Note: + This updates ``inputs_embeds`` in place. + """ + flattened_embeddings = _flatten_embeddings(multimodal_embeddings) + inputs_embeds[placeholder_map.dest] = flattened_embeddings[ + placeholder_map.src] + return inputs_embeds + + def _merge_multimodal_embeddings( inputs_embeds: torch.Tensor, is_multimodal: torch.Tensor, diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 489e1e51f05cb..53da2badb9b98 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,6 +1,7 @@ from .base import (BatchedTensorInputs, MultiModalDataBuiltins, - MultiModalDataDict, MultiModalInputs, MultiModalPlugin, - NestedTensors) + MultiModalDataDict, MultiModalInputs, + MultiModalPlaceholderDict, MultiModalPlaceholderMap, + MultiModalPlugin, NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -17,6 +18,8 @@ "MultiModalDataBuiltins", "MultiModalDataDict", "MultiModalInputs", + "MultiModalPlaceholderDict", + "MultiModalPlaceholderMap", "MultiModalPlugin", "NestedTensors", "MULTIMODAL_REGISTRY", diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 84e71cbf60df7..6b10d0c609f13 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,8 +1,9 @@ import sys from abc import ABC, abstractmethod from collections import UserDict, defaultdict -from typing import (Any, Callable, Dict, List, Mapping, Optional, Tuple, Type, - TypedDict, TypeVar, Union, cast, final) +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping, + NamedTuple, Optional, Tuple, Type, TypedDict, TypeVar, + Union, cast, final) import numpy as np import torch @@ -11,12 +12,15 @@ from torch import nn from typing_extensions import TypeAlias -from vllm.config import ModelConfig from vllm.inputs import InputContext from vllm.logger import init_logger from vllm.utils import (JSONTree, get_allowed_kwarg_only_overrides, is_list_of, json_map_leaves, resolve_mm_processor_kwargs) +if TYPE_CHECKING: + from vllm.config import ModelConfig + from vllm.sequence import SequenceGroupMetadata + logger = init_logger(__name__) NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] @@ -151,6 +155,30 @@ class MultiModalDataBuiltins(TypedDict, total=False): Read more on that :ref:`here `. """ + +class PlaceholderRange(TypedDict): + """ + Placeholder location information for multi-modal data. + + For example: + Prompt: AAAA BBBB What is in these images? + Images A and B will have: + A: { "offset": 0, "length": 4 } + B: { "offset": 5, "length": 4 } + """ + + offset: int + """The start index of the placeholder in the prompt.""" + + length: int + """The length of the placeholder.""" + + +MultiModalPlaceholderDict = Mapping[str, List[PlaceholderRange]] +""" +A dictionary containing placeholder ranges. 
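As a minimal standalone sketch of the placeholder-range convention introduced here (illustrative only, not part of the diff; the TypedDict and helper below simply mirror the PlaceholderRange and consecutive_placeholder_ranges definitions added in this patch, redefined so the snippet stays self-contained):

# Illustrative sketch of the PlaceholderRange convention; mirrors the
# definitions added in this patch, repeated here only for self-containment.
from typing import List, TypedDict


class PlaceholderRange(TypedDict):
    offset: int  # start index of the placeholder run in the prompt
    length: int  # number of placeholder tokens in the run


def consecutive_placeholder_ranges(num_items: int,
                                   item_size: int) -> List[PlaceholderRange]:
    # Back-to-back ranges, as used for dummy/profiling data in this patch.
    return [
        PlaceholderRange(offset=i * item_size, length=item_size)
        for i in range(num_items)
    ]


# Two images, four placeholder tokens each, packed back to back:
print(consecutive_placeholder_ranges(num_items=2, item_size=4))
# -> [{'offset': 0, 'length': 4}, {'offset': 4, 'length': 4}]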
+""" + MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], MultiModalInputs] """ @@ -243,7 +271,7 @@ def wrapper(model_cls: N) -> N: return wrapper - def map_input(self, model_config: ModelConfig, + def map_input(self, model_config: "ModelConfig", data: MultiModalData[object], mm_processor_kwargs: Dict[str, Any]) -> MultiModalInputs: """ @@ -332,7 +360,7 @@ def wrapper(model_cls: N) -> N: return wrapper - def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: + def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: """ Get the maximum number of multi-modal tokens for profiling the memory usage of a model. @@ -366,3 +394,179 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: self._validate_max_multimodal_tokens(max_mm_tokens) return max_mm_tokens + + +class MultiModalPlaceholderMap: + """ + Relates multi-modal embeddings to their corresponding placeholders. + """ + + class IndexMap(NamedTuple): + src: List[int] + dest: List[int] + + src_ranges: List[range] + """ + The indices of the multi-modal embeddings that will replace the + corresponding placeholder embeddings pointed to by ``dest_ranges``. + """ + + src_len: int + """ + The total number of flattened multi-modal embeddings. + """ + + dest_ranges: List[range] + """ + The indices of the placeholder embeddings that will be replaced by the + multimodal embeddings. + """ + + dest_len: int + """ + The total number of embeddings in the destination tensor. + """ + + def __init__(self): + self.src_ranges = [] + self.src_len = 0 + self.dest_ranges = [] + self.dest_len = 0 + + @classmethod + def from_seq_group( + cls, seq_group: "SequenceGroupMetadata", positions: range + ) -> Tuple[Optional[MultiModalDataDict], Dict[str, + "MultiModalPlaceholderMap"]]: + """ + Returns the multi-modal items that intersect with the portion of a + prompt (``seq_group``) represented by ``positions``, as well as a + ``MultiModalPlaceholderMap`` that relates the multi-modal embedding + vectors to their corresponding placeholders. + + Consider the following scenarios: + + Prompt: |AAAA BBBB What's in these images?| + Positions: |.................................| + + images = [A, B] + src_ranges = [(0, 4), (4, 8)] + dest_ranges = [(0, 4), (5, 9)] + + Prompt: |AAAA BBBB What's in these images?| + Positions: | ..... | + + images = [A, B] + src_ranges = [(2, 4), (4, 6)] + dest_ranges = [(0, 2), (3, 5)] + + Prompt: |AAAA BBBB What's in these images?| + Positions: | ......... 
| + + images = [B] + src_ranges = [(0, 4)] + dest_ranges = [(0, 4)] + + Prompt: |AAAA BBBB What's in these images?| + Positions: | .......................| + + images = [] + src_ranges = [] + dest_ranges = [] + """ + if (not seq_group.multi_modal_data + or not seq_group.multi_modal_placeholders): + return seq_group.multi_modal_data, {} + + mm_data = {**seq_group.multi_modal_data} + placeholder_maps: Dict[str, MultiModalPlaceholderMap] = defaultdict( + MultiModalPlaceholderMap) + + for modality, placeholders in seq_group.multi_modal_placeholders.items( + ): + mm_items = mm_data.pop(modality) + if not isinstance(mm_items, list): + mm_items = [mm_items] + + if positions: + intersecting_items = placeholder_maps[ + modality].append_items_from_seq_group( + positions, mm_items, placeholders) + + if intersecting_items: + mm_data[modality] = intersecting_items + + return mm_data, placeholder_maps + + def append_items_from_seq_group( + self, positions: range, multi_modal_items: List[_T], + multi_modal_placeholders: List[PlaceholderRange]) -> List[_T]: + """ + Adds the multi-modal items that intersect ```positions`` to this + placeholder map and returns the intersecting items. + """ + intersecting_items = [] + + if len(multi_modal_items) != len(multi_modal_placeholders): + raise ValueError( + "Multi-modal placeholders and items must have the same length." + ) + for placeholder_dict, mm_item in zip(multi_modal_placeholders, + multi_modal_items): + placeholder = range( + placeholder_dict["offset"], + placeholder_dict["offset"] + placeholder_dict["length"]) + intersection = range(max(positions.start, placeholder.start), + min(positions.stop, placeholder.stop)) + + if not intersection: + # Skip this multi-modal item. + continue + + token_embedding_range = range(intersection.start - positions.start, + intersection.stop - positions.start) + + multimodal_embedding_range = range( + intersection.start - placeholder.start + self.src_len, + intersection.stop - placeholder.start + self.src_len) + + intersecting_items.append(mm_item) + self.dest_ranges.append(token_embedding_range) + self.src_ranges.append(multimodal_embedding_range) + self.src_len += len(placeholder) + + self.dest_len += len(positions) + return intersecting_items + + def extend(self, other: "MultiModalPlaceholderMap"): + """ + Adds the placeholders from another ``MultiModalPlaceholderMap`` to this + instance based on the source and destination tensors being + concatenated. + """ + + self.src_ranges.extend( + range(self.src_len + r.start, self.src_len + r.stop) + for r in other.src_ranges) + self.src_len += other.src_len + self.dest_ranges.extend( + range(self.dest_len + r.start, self.dest_len + r.stop) + for r in other.dest_ranges) + self.dest_len += other.dest_len + + def index_map(self) -> "IndexMap": + """ + Finalizes the placeholder map into lists of indices that can be used to + index the source and destination tensors. 
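To make the src/dest index semantics concrete, a small standalone sketch (illustrative, not part of the diff; shapes and indices are made up) of how flattened IndexMap-style index lists are consumed when scattering multi-modal embeddings into the token embeddings, in the spirit of merge_multimodal_embeddings_from_map above:

# Illustrative sketch: scatter multi-modal embeddings into token embeddings
# using flattened source/destination index lists (IndexMap-style).
import torch

hidden_size = 8
inputs_embeds = torch.zeros(10, hidden_size)  # 10 prompt-token slots
mm_embeds = torch.randn(4, hidden_size)       # 4 image embedding vectors

# One image whose 4 placeholder tokens start at prompt offset 2.
src = [0, 1, 2, 3]    # rows of mm_embeds to read
dest = [2, 3, 4, 5]   # rows of inputs_embeds to overwrite

inputs_embeds[dest] = mm_embeds[src]          # in-place scatter
assert torch.equal(inputs_embeds[2:6], mm_embeds)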
+ """ + + src_indices = [i for r in self.src_ranges for i in r] + dest_indices = [i for r in self.dest_ranges for i in r] + + if len(src_indices) != len(dest_indices): + raise ValueError( + f"The number of source ({len(src_indices)}) and destination " + f"indices ({len(dest_indices)}) must be the same.") + + return MultiModalPlaceholderMap.IndexMap(src=src_indices, + dest=dest_indices) diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 5f74bcea65ce2..3f6bb6c8338d2 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,11 +1,10 @@ from functools import lru_cache -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch from PIL import Image from transformers.image_processing_base import BatchFeature -from vllm.config import ModelConfig from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.processor import get_image_processor @@ -13,6 +12,9 @@ from .base import MultiModalData, MultiModalInputs, MultiModalPlugin +if TYPE_CHECKING: + from vllm.config import ModelConfig + logger = init_logger(__name__) cached_get_image_processor = lru_cache(get_image_processor) @@ -26,7 +28,7 @@ def get_data_key(self) -> str: def _get_hf_image_processor( self, - model_config: ModelConfig, + model_config: "ModelConfig", mm_processor_kwargs: Optional[Dict[str, Any]] = None, ): if mm_processor_kwargs is None: diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 5e9b8bd518de3..bce2f4c6abe5b 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,8 +1,7 @@ import functools from collections import UserDict -from typing import Any, Dict, Mapping, Optional, Sequence +from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Sequence -from vllm.config import ModelConfig from vllm.logger import init_logger from .audio import AudioPlugin @@ -11,6 +10,9 @@ from .image import ImagePlugin from .video import VideoPlugin +if TYPE_CHECKING: + from vllm.config import ModelConfig + logger = init_logger(__name__) @@ -20,7 +22,7 @@ class _MultiModalLimits(UserDict): when attempting to access a model that does not exist. """ - def __getitem__(self, key: ModelConfig) -> Dict[str, int]: + def __getitem__(self, key: "ModelConfig") -> Dict[str, int]: try: return super().__getitem__(key) except KeyError as exc: @@ -98,7 +100,7 @@ def register_image_input_mapper( def map_input( self, - model_config: ModelConfig, + model_config: "ModelConfig", data: MultiModalDataDict, mm_processor_kwargs: Optional[Dict[str, Any]] = None, ) -> MultiModalInputs: @@ -139,7 +141,7 @@ def map_input( return MultiModalInputs(merged_dict) - def create_input_mapper(self, model_config: ModelConfig): + def create_input_mapper(self, model_config: "ModelConfig"): """ Create an input mapper (see :meth:`map_input`) for a specific model. """ @@ -177,7 +179,7 @@ def register_max_image_tokens( """ return self.register_max_multimodal_tokens("image", max_mm_tokens) - def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: + def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: """ Get the maximum number of multi-modal tokens for profiling the memory usage of a model. 
@@ -195,7 +197,7 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: def init_mm_limits_per_prompt( self, - model_config: ModelConfig, + model_config: "ModelConfig", ) -> None: """ Initialize the maximum number of multi-modal input instances for each @@ -231,7 +233,7 @@ def init_mm_limits_per_prompt( def get_mm_limits_per_prompt( self, - model_config: ModelConfig, + model_config: "ModelConfig", ) -> Mapping[str, int]: """ Get the maximum number of multi-modal input instances for each modality diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3c801464383ad..c5ff552e06099 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -10,7 +10,7 @@ from vllm.connections import global_http_connection from vllm.envs import VLLM_AUDIO_FETCH_TIMEOUT, VLLM_IMAGE_FETCH_TIMEOUT from vllm.logger import init_logger -from vllm.multimodal.base import MultiModalDataDict +from vllm.multimodal.base import MultiModalDataDict, PlaceholderRange from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer logger = init_logger(__name__) @@ -258,7 +258,7 @@ def repeat_and_pad_placeholder_tokens( repeat_count: Union[int, List[int]], pad_token_left: Optional[int] = None, pad_token_right: Optional[int] = None, -) -> Tuple[Optional[str], List[int]]: +) -> Tuple[Optional[str], List[int], List[PlaceholderRange]]: if isinstance(repeat_count, int): repeat_count = [repeat_count] @@ -301,6 +301,7 @@ def repeat_and_pad_placeholder_tokens( new_prompt += prompt_parts[-1] new_token_ids: List[int] = [] + placeholder_ranges: List[PlaceholderRange] = [] placeholder_token_idx = 0 for i, token in enumerate(prompt_token_ids): if token == placeholder_token_id: @@ -310,6 +311,10 @@ def repeat_and_pad_placeholder_tokens( pad_token_left=pad_token_left, pad_token_right=pad_token_right, ) + placeholder_ranges.append({ + "offset": len(new_token_ids), + "length": len(replacement_ids) + }) new_token_ids.extend(replacement_ids) placeholder_token_idx += 1 @@ -320,4 +325,14 @@ def repeat_and_pad_placeholder_tokens( else: new_token_ids.append(token) - return new_prompt, new_token_ids + return new_prompt, new_token_ids, placeholder_ranges + + +def consecutive_placeholder_ranges(num_items: int, + item_size: int) -> List[PlaceholderRange]: + """Returns a list of consecutive PlaceholderRanges of a fixed size""" + + return [ + PlaceholderRange(offset=i * item_size, length=item_size) + for i in range(num_items) + ] diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index c3235c4acb6fd..6c2c6720f4276 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,18 +1,19 @@ from functools import lru_cache -from typing import Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import numpy as np -from vllm.config import ModelConfig from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.processor import get_video_processor from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import is_list_of from .base import MultiModalData, MultiModalInputs from .image import ImagePlugin +if TYPE_CHECKING: + from vllm.config import ModelConfig + logger = init_logger(__name__) cached_get_video_processor = lru_cache(get_video_processor) @@ -38,7 +39,7 @@ def get_data_key(self) -> str: def _get_hf_video_processor( self, - model_config: ModelConfig, + model_config: "ModelConfig", mm_processor_kwargs: Optional[Dict[str, Any]] = None, ): if 
mm_processor_kwargs is None: @@ -56,7 +57,10 @@ def _default_input_mapper( ) -> MultiModalInputs: model_config = ctx.model_config - if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray): + if isinstance(data, list) and len(data) == 1: + data = data[0] + + if isinstance(data, np.ndarray): video_processor = self._get_hf_video_processor( model_config, mm_processor_kwargs, diff --git a/vllm/sequence.py b/vllm/sequence.py index ff59f333f00b4..ee547dde45394 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -15,13 +15,13 @@ from vllm.inputs.parse import is_encoder_decoder_inputs from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import RequestOutputKind, SamplingParams if TYPE_CHECKING: from vllm.inputs import SingletonInputs - from vllm.multimodal.base import MultiModalDataDict VLLM_TOKEN_ID_ARRAY_TYPE = "l" @@ -485,7 +485,7 @@ def prompt_token_ids(self) -> List[int]: return cast(List[int], self.inputs.get(prompt_token_ids_key)) @property - def multi_modal_data(self) -> "MultiModalDataDict": + def multi_modal_data(self) -> MultiModalDataDict: inputs = self.inputs if (inputs.get("multi_modal_data") @@ -495,11 +495,15 @@ def multi_modal_data(self) -> "MultiModalDataDict": ) return cast( - "MultiModalDataDict", + MultiModalDataDict, (inputs.get("multi_modal_data") or inputs.get("encoder_multi_modal_data") or {}), ) + @property + def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: + return self.inputs.get("multi_modal_placeholders") or {} + @property def mm_processor_kwargs(self) -> Dict[str, Any]: return self.inputs.get("mm_processor_kwargs") or {} @@ -728,9 +732,13 @@ def encoder_prompt_token_ids(self) -> Optional[List[int]]: if self.encoder_seq is not None else None) @property - def multi_modal_data(self) -> "MultiModalDataDict": + def multi_modal_data(self) -> MultiModalDataDict: return self.first_seq.multi_modal_data + @property + def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: + return self.first_seq.multi_modal_placeholders + @property def mm_processor_kwargs(self) -> Dict[str, Any]: return self.first_seq.mm_processor_kwargs @@ -946,6 +954,7 @@ class SequenceGroupMetadata( # "MultiModalDataDict" types. We have to use Any due to msgspec # doesn't allow to have union of 2 different dicts. 
multi_modal_data: Optional[Any] = None + multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None mm_processor_kwargs: Optional[Dict[str, Any]] = None encoder_seq_data: Optional[SequenceData] = None cross_block_table: Optional[List[int]] = None diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 5032896600b3b..0c6fcdf03ba9e 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -1,5 +1,6 @@ import dataclasses import weakref +from collections import defaultdict from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union @@ -16,7 +17,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalInputs, MultiModalPlaceholderMap) from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) from vllm.transformers_utils.config import uses_mrope @@ -148,9 +149,18 @@ def build(self) -> ModelInputForCPU: query_lens=seq_lens, ) - def _compute_multi_modal_input(self, seq_data: SequenceData, mm_data, - computed_len: int, + def _compute_multi_modal_input(self, seq_group: SequenceGroupMetadata, + seq_data: SequenceData, computed_len: int, mm_processor_kwargs: Dict[str, Any]): + + # NOTE: mm_data only includes the subset of multi-modal items that + # intersect with the current prefill positions. + mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( + seq_group, range(computed_len, len(seq_data.get_token_ids()))) + + if not mm_data: + return + mm_kwargs = self.multi_modal_input_mapper(mm_data, mm_processor_kwargs) # special processing for mrope position deltas. 
@@ -179,7 +189,7 @@ def _compute_multi_modal_input(self, seq_data: SequenceData, mm_data, context_len=computed_len, ) seq_data.mrope_position_delta = mrope_position_delta - return mm_kwargs, mrope_positions + return mm_kwargs, placeholder_maps, mrope_positions def _prepare_prompt( self, @@ -194,6 +204,9 @@ def _prepare_prompt( slot_mapping: List[int] = [] seq_lens: List[int] = [] multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_modal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -210,11 +223,15 @@ def _prepare_prompt( input_tokens.extend(prompt_tokens) # Token ids mrope_positions = None - if (mm_data := seq_group_metadata.multi_modal_data): - mm_kwargs, mrope_positions = self._compute_multi_modal_input( - seq_data, mm_data, computed_len, + if seq_group_metadata.multi_modal_data: + mm_kwargs, placeholder_maps, mrope_positions = self \ + ._compute_multi_modal_input( + seq_group_metadata, seq_data, computed_len, seq_group_metadata.mm_processor_kwargs) multi_modal_inputs_list.append(mm_kwargs) + for modality, placeholder_map in placeholder_maps.items(): + multi_modal_placeholder_maps[modality].extend( + placeholder_map) # Token position ids # NOTE(woosuk): Here we assume that the first token in the prompt @@ -264,6 +281,11 @@ def _prepare_prompt( slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) # type: ignore + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + multi_modal_placeholder_maps.items() + } attn_metadata = self.attn_backend.make_metadata( is_prompt=True, @@ -275,6 +297,7 @@ def _prepare_prompt( num_decode_tokens=0, block_tables=torch.tensor([]), slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=placeholder_index_maps, ) multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) @@ -366,6 +389,7 @@ def _prepare_decode( attn_metadata = self.attn_backend.make_metadata( is_prompt=False, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_decode_seq_len=max_decode_seq_len, diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 6a00444f5098b..a4b665d71f28a 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -306,13 +306,12 @@ def profile_run(self) -> None: (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len - decoder_seq_data, decoder_dummy_multi_modal_data \ - = self.input_registry.dummy_data_for_profiling( - self.model_config, + decoder_dummy_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, seq_len, self.mm_registry, is_encoder_data=False) - encoder_seq_data, encoder_dummy_multi_modal_data \ + encoder_dummy_data \ = self.input_registry.dummy_data_for_profiling( self.model_config, seq_len, @@ -320,26 +319,31 @@ def profile_run(self) -> None: is_encoder_data=True) # Having more tokens is over-conservative but otherwise fine - assert len(decoder_seq_data.prompt_token_ids) >= seq_len, ( + assert len( + decoder_dummy_data.seq_data.prompt_token_ids + ) >= seq_len, ( f"Expected at least {seq_len} dummy tokens for profiling, " - f"but got: {len(decoder_seq_data.prompt_token_ids)}") + f"but got: {len(decoder_dummy_data.seq_data.prompt_token_ids)}" + ) - assert decoder_dummy_multi_modal_data is None or \ - encoder_dummy_multi_modal_data is 
None, ( + assert decoder_dummy_data.multi_modal_data is None or \ + encoder_dummy_data.multi_modal_data is None, ( "Multi-modal data can't be provided in both encoder and decoder" ) seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, - seq_data={group_id: decoder_seq_data}, + seq_data={group_id: decoder_dummy_data.seq_data}, sampling_params=sampling_params, block_tables=None, - encoder_seq_data=encoder_seq_data, + encoder_seq_data=encoder_dummy_data.seq_data, cross_block_table=None, - multi_modal_data=decoder_dummy_multi_modal_data - or encoder_dummy_multi_modal_data, - ) + multi_modal_data=decoder_dummy_data.multi_modal_data + or encoder_dummy_data.multi_modal_data, + multi_modal_placeholders=decoder_dummy_data. + multi_modal_placeholders + or encoder_dummy_data.multi_modal_placeholders) seqs.append(seq) # Run the model with the dummy inputs. diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 891637dafbb14..f2123c64c3274 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -40,7 +40,8 @@ from vllm.model_executor.models import supports_lora, supports_multimodal from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalRegistry) + MultiModalInputs, MultiModalPlaceholderMap, + MultiModalRegistry) from vllm.platforms import current_platform from vllm.prompt_adapter.layers import PromptAdapterMapping from vllm.prompt_adapter.request import PromptAdapterRequest @@ -242,6 +243,8 @@ def __init__( # Multi-modal inputs. multi_modal_inputs: Optional[MultiModalInputs] = None, + multi_modal_placeholder_maps: Optional[Dict[ + str, MultiModalPlaceholderMap]] = None, # Whether the prefix cache is hit (prefill only). prefix_cache_hit: bool = False, @@ -361,6 +364,7 @@ def __init__( self.prompt_adapter_request = prompt_adapter_request self.multi_modal_inputs = multi_modal_inputs + self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.prefix_cache_hit = prefix_cache_hit self.n_seqs = len(self.seq_ids) @@ -635,7 +639,12 @@ def _compute_prompt_adapter_input( def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): """If multi-modal data is given, add it to the input.""" - mm_data = seq_group_metadata.multi_modal_data + # NOTE: mm_data only includes the subset of multi-modal items that + # intersect with the current prefill positions. + positions = inter_data.input_positions[0] + mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( + seq_group_metadata, + range(positions[0], positions[0] + len(positions))) if not mm_data: return @@ -643,6 +652,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, mm_data, mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs) inter_data.multi_modal_inputs = mm_kwargs + inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. 
if self.runner.model_is_mrope: @@ -1255,7 +1265,7 @@ def profile_run(self) -> None: (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len - seq_data, dummy_multi_modal_data = self.input_registry \ + dummy_data = self.input_registry \ .dummy_data_for_profiling(self.model_config, seq_len, self.mm_registry) @@ -1263,12 +1273,13 @@ def profile_run(self) -> None: seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, - seq_data={group_id: seq_data}, + seq_data={group_id: dummy_data.seq_data}, sampling_params=sampling_params, block_tables=None, lora_request=dummy_lora_requests_per_seq[group_id] if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_multi_modal_data, + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data.multi_modal_placeholders, ) seqs.append(seq) diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 86883cf152449..89d7addb5a8d9 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -46,9 +46,8 @@ def _init_attn_metadata_from_tensor_dict( # Extract the fields used to create AttentionMetadata. valid_attn_kwargs = {} for field in dataclasses.fields(attn_backend.get_metadata_cls()): - val = tensor_dict.pop(field.name, None) - if val is not None: - valid_attn_kwargs[field.name] = val + if field.name in tensor_dict: + valid_attn_kwargs[field.name] = tensor_dict.pop(field.name) attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) tensor_dict["attn_metadata"] = attn_metadata diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index a164fbe3393c4..3da738636a59d 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -1,4 +1,5 @@ -from typing import List, NamedTuple, Optional, Tuple +from collections import defaultdict +from typing import Dict, List, NamedTuple, Optional, Tuple import openvino as ov import torch @@ -14,7 +15,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.openvino import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalInputs, MultiModalPlaceholderMap) from vllm.sequence import SequenceGroupMetadata logger = init_logger(__name__) @@ -115,6 +116,9 @@ def _prepare_model_input( past_lens: List[int] = [] query_lens: List[int] = [] multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_modal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) subsequence_begins: List[int] = [] block_indices: List[int] = [] @@ -168,15 +172,6 @@ def _prepare_model_input( and self.sliding_window is None and is_prompt) - mm_data = seq_group_metadata.multi_modal_data - if mm_data: - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata. - mm_processor_kwargs, - ) - multi_modal_inputs_list.append(mm_kwargs) - block_table = seq_group_metadata.block_tables[seq_id] # TODO(sang): Combine chunked prefill and prefix caching by # only allowing multiple of block_size chunk size. 
@@ -220,7 +215,8 @@ def _prepare_model_input( query_lens.append(query_len) input_tokens.extend(tokens) - input_positions.extend(list(range(computed_len, seq_len))) + positions_range = range(computed_len, seq_len) + input_positions.extend(list(positions_range)) past_lens.append(computed_len) subsequence_begins.append(subsequence_begins[-1] + query_len) @@ -233,6 +229,22 @@ def _prepare_model_input( ), "seq_len: {}, computed_len: {}, query_len: {}".format( seq_len, computed_len, query_len) + if seq_group_metadata.multi_modal_data: + # NOTE: mm_data only includes the subset of multi-modal + # items that intersect with the current prefill positions. + mm_data, placeholder_maps = MultiModalPlaceholderMap \ + .from_seq_group(seq_group_metadata, positions_range) + + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + mm_processor_kwargs=seq_group_metadata. + mm_processor_kwargs) + multi_modal_inputs_list.append(mm_kwargs) + + for modality, placeholder_map in placeholder_maps.items(): + multi_modal_placeholder_maps[modality].extend( + placeholder_map, ) + max_query_len = max(query_lens) assert max_query_len > 0, "query_lens: {}".format(query_lens) @@ -261,12 +273,19 @@ def _prepare_model_input( max_context_len, dtype=torch.int32, device=self.device) # type: ignore + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + multi_modal_placeholder_maps.items() + } + attn_metadata = self.attn_backend.make_openvino_metadata( past_lens=past_lens_tensor, subsequence_begins=subsequence_begins_tensor, block_indices=block_indices_tensor, block_indices_begins=block_indices_begins_tensor, max_context_len=max_context_len_tensor, + multi_modal_placeholder_index_maps=placeholder_index_maps, ) multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 87ced7818a676..3792cbc0f730f 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -184,6 +184,7 @@ def _dummy_run( num_prefill_tokens=batch_size * seq_len, num_decode_tokens=0, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, block_tables=None, context_lens=None, ) @@ -216,6 +217,7 @@ def _dummy_run( num_prefill_tokens=0, num_decode_tokens=batch_size * seq_len, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, block_tables=block_tables, context_lens=context_lens, ) @@ -360,6 +362,7 @@ def _prepare_prompt( num_prefill_tokens=0, # NOTE: This is not used. 
num_decode_tokens=0, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, block_tables=None, context_lens=None, ) @@ -429,6 +432,7 @@ def _prepare_decode( num_prefill_tokens=0, num_decode_tokens=batch_size, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, block_tables=block_tables, context_lens=context_lens, ) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 75a6de3b24ba4..739fe1b3d2c4f 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,6 +1,7 @@ import dataclasses import time import weakref +from collections import defaultdict from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar) @@ -19,7 +20,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalRegistry) + MultiModalInputs, MultiModalPlaceholderMap, + MultiModalRegistry) from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import DeviceMemoryProfiler, make_tensor_with_pad @@ -161,6 +163,9 @@ def _prepare_prompt( slot_mapping: List[int] = [] seq_lens: List[int] = [] multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_modal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -179,7 +184,21 @@ def _prepare_prompt( # Token position ids # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. - input_positions.extend(list(range(computed_len, seq_len))) + positions_range = range(computed_len, seq_len) + input_positions.extend(list(positions_range)) + + if seq_group_metadata.multi_modal_data: + # NOTE: mm_data only includes the subset of multi-modal items + # that intersect with the current prefill positions. 
+ mm_data, placeholder_maps = MultiModalPlaceholderMap \ + .from_seq_group(seq_group_metadata, positions_range) + + mm_kwargs = self.runner.multi_modal_input_mapper(mm_data) + multi_modal_inputs_list.append(mm_kwargs) + + for modality, placeholder_map in placeholder_maps.items(): + multi_modal_placeholder_maps[modality].extend( + placeholder_map) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -220,6 +239,11 @@ def _prepare_prompt( slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) # type: ignore + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + multi_modal_placeholder_maps.items() + } max_seqlen = max(seq_lens) tmp = [0] @@ -230,6 +254,7 @@ def _prepare_prompt( attn_metadata = self.attn_backend.make_metadata( is_prompt=True, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=placeholder_index_maps, seq_lens=seq_lens, seqlen_q=seqlen_q, max_seqlen=max_seqlen, @@ -313,6 +338,7 @@ def _prepare_decode( attn_metadata = self.attn_backend.make_metadata( is_prompt=False, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, seq_lens=seq_lens, seqlen_q=torch.tensor([]), max_seqlen=0, @@ -450,7 +476,7 @@ def profile_run(self) -> None: (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len - seq_data, dummy_multi_modal_data = self.input_registry \ + dummy_data = self.input_registry \ .dummy_data_for_profiling(self.model_config, seq_len, self.mm_registry) @@ -458,12 +484,12 @@ def profile_run(self) -> None: seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, - seq_data={group_id: seq_data}, + seq_data={group_id: dummy_data.seq_data}, sampling_params=sampling_params, block_tables=None, lora_request=None, - multi_modal_data=dummy_multi_modal_data, - ) + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data.multi_modal_placeholders) seqs.append(seq) # Run the model with the dummy inputs. From d522034c85e8f994bbd193514393056232edd247 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Fri, 1 Nov 2024 13:56:13 -1000 Subject: [PATCH 19/43] [ci/build] Have dependabot ignore pinned dependencies (#9935) Signed-off-by: kevin --- .github/dependabot.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index a21acd9671eeb..4f54eea564ecb 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -14,6 +14,15 @@ updates: reviewers: ["khluu", "simon-mo"] allow: - dependency-type: "all" + ignore: + - dependency-name: "torch" + - dependency-name: "torchvision" + - dependency-name: "xformers" + - dependency-name: "lm-format-enforcer" + - dependency-name: "gguf" + - dependency-name: "compressed-tensors" + - dependency-name: "ray[adag]" + - dependency-name: "lm-eval" groups: patch-update: applies-to: version-updates From a78dd3303efac284afc6785eddba5f175285863b Mon Sep 17 00:00:00 2001 From: sroy745 <142070531+sroy745@users.noreply.github.com> Date: Fri, 1 Nov 2024 23:22:49 -0700 Subject: [PATCH 20/43] [Encoder Decoder] Add flash_attn kernel support for encoder-decoder models (#9559) --- tests/encoder_decoder/test_e2e_correctness.py | 88 +++-- tests/kernels/test_encoder_decoder_attn.py | 156 ++++++-- tests/kernels/utils.py | 90 ++++- .../vision_language/test_florence2.py | 2 +- vllm/attention/backends/flash_attn.py | 364 +++++++++++++----- vllm/attention/backends/utils.py | 159 +++++++- vllm/attention/backends/xformers.py | 131 ++----- vllm/attention/selector.py | 2 +- vllm/model_executor/models/bart.py | 2 - vllm/utils.py | 4 +- vllm/worker/enc_dec_model_runner.py | 35 +- 11 files changed, 716 insertions(+), 317 deletions(-) diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index bef0c515b9073..f2d7e9fd78cf3 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -7,12 +7,18 @@ import pytest from transformers import AutoModelForSeq2SeqLM +from vllm.attention.selector import (_Backend, + global_force_attn_backend_context_manager) from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs from ..conftest import DecoderPromptType from ..models.utils import check_logprobs_close +LIST_ENC_DEC_SUPPORTED_BACKENDS = [ + _Backend.XFORMERS, _Backend.FLASH_ATTN, None +] + def vllm_to_hf_output( vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -29,7 +35,8 @@ def vllm_to_hf_output( @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) -@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) @@ -48,6 +55,7 @@ def test_encoder_decoder_e2e( num_logprobs: int, decoder_prompt_type: DecoderPromptType, enforce_eager: bool, + attn_backend: _Backend, ) -> None: ''' End-to-End (E2E) test for the encoder-decoder framework. @@ -56,43 +64,49 @@ def test_encoder_decoder_e2e( implementations to ensure that both implementations produce consistent and correct results. 
''' - test_case_prompts = example_encoder_decoder_prompts[decoder_prompt_type] + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + test_case_prompts = example_encoder_decoder_prompts[ + decoder_prompt_type] - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } + # Configuration settings for HF baseline + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_case_prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - with vllm_runner(model, dtype=dtype, - enforce_eager=enforce_eager) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_case_prompts, max_tokens, num_logprobs) + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_outputs = ( + hf_model.generate_encoder_decoder_greedy_logprobs_limit( + test_case_prompts, + max_tokens, + num_logprobs, + **hf_kwargs, + )) + with vllm_runner(model, dtype=dtype, + enforce_eager=enforce_eager) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + test_case_prompts, max_tokens, num_logprobs) - hf_skip_tokens = (1 - if decoder_prompt_type == DecoderPromptType.NONE else 0) + hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE + else 0) - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index bc99c5559d388..a1dd5eeeaa398 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -16,13 +16,13 @@ from vllm.attention import (Attention, AttentionBackend, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP -from vllm.attention.selector import (_Backend, +from vllm.attention.selector import (_Backend, get_attn_backend, global_force_attn_backend_context_manager) +from vllm.forward_context import set_forward_context from vllm.platforms import current_platform # List of support backends for encoder/decoder models -LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS] - +LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] HEAD_SIZES = [64, 256] NUM_HEADS = [1, 16] @@ -145,7 +145,8 @@ class that Attention will automatically select when it is constructed. 
test_pt.num_heads, test_pt.head_size, test_pt.block_size, - device=CUDA_DEVICE) + device=CUDA_DEVICE, + backend=test_pt.backend_name) return TestResources(scale, attn_backend, attn, kv_cache) @@ -592,6 +593,7 @@ def _run_encoder_attention_test( attn: Attention, encoder_test_params: PhaseTestParameters, attn_metadata: AttentionMetadata, + test_pt: TestPoint, ) -> torch.Tensor: ''' Run encoder attention. @@ -610,6 +612,8 @@ def _run_encoder_attention_test( (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed {query,key,value} and @@ -619,20 +623,31 @@ def _run_encoder_attention_test( attn_type = AttentionType.ENCODER packed_qkv = encoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None - return attn.forward(packed_qkv.query, - packed_qkv.key, - packed_qkv.value, - torch.tensor([], - dtype=torch.float32, - device=packed_qkv.query.device), - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. + reshaped_query = packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + packed_qkv.key, + packed_qkv.value, + torch.tensor([], + dtype=torch.float32, + device=packed_qkv.query.device), + attn_metadata, + attn_type=attn_type) def _run_decoder_self_attention_test( test_rsrcs: TestResources, decoder_test_params: PhaseTestParameters, attn_metadata: AttentionMetadata, + test_pt: TestPoint, ) -> torch.Tensor: ''' Run decoder self-attention test. @@ -650,6 +665,8 @@ def _run_decoder_self_attention_test( query/key/value fields * attn_metadata: attention metadata for decoder-self attention (contains KV cache memory-mapping) + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed_{query,key,value}, kv_cache @@ -660,12 +677,22 @@ def _run_decoder_self_attention_test( kv_cache = test_rsrcs.kv_cache packed_qkv = decoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None - return attn.forward(packed_qkv.query, - packed_qkv.key, - packed_qkv.value, - kv_cache, - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. 
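A quick standalone illustration of the reshape these tests rely on (illustrative only, not part of the diff; sizes are arbitrary): collapsing [batch_size, seq_len, num_heads, head_size] into the [num_tokens, hidden_size] layout the attention backends expect, with hidden_size = num_heads * head_size.

# Illustrative sketch of the query reshape described in the comment above.
import torch

batch_size, seq_len, num_heads, head_size = 2, 5, 16, 64
query = torch.randn(batch_size, seq_len, num_heads, head_size)

# [batch_size, seq_len, num_heads, head_size] -> [num_tokens, hidden_size]
reshaped = query.view(-1, num_heads * head_size)
assert reshaped.shape == (batch_size * seq_len, num_heads * head_size)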
+ reshaped_query = packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + packed_qkv.key, + packed_qkv.value, + kv_cache, + attn_metadata, + attn_type=attn_type) def _run_encoder_decoder_cross_attention_test( @@ -673,6 +700,7 @@ def _run_encoder_decoder_cross_attention_test( decoder_test_params: PhaseTestParameters, cross_test_params: Optional[PhaseTestParameters], attn_metadata: AttentionMetadata, + test_pt: TestPoint, ) -> torch.Tensor: ''' Run encoder/decoder cross-attention test. @@ -701,6 +729,8 @@ def _run_encoder_decoder_cross_attention_test( (number_of_tokens x num_heads x head_size) key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed_{query,key,value}, kv_cache @@ -718,12 +748,37 @@ def _run_encoder_decoder_cross_attention_test( cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key) value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value) - return attn.forward(decoder_test_params.packed_qkvo.packed_qkv.query, - key, - value, - kv_cache, - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. + reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + key, + value, + kv_cache, + attn_metadata, + attn_type=attn_type) + + +@pytest.fixture(autouse=True) +def set_reset_environment(attn_backend): + # Set the default torch datatype to bfloat16 to enable + # testing of the Flash Attention backend. Also clear the + # cached value of the backend. + default_dtype = torch.get_default_dtype() + if attn_backend.name == 'FLASH_ATTN': + torch.set_default_dtype(torch.bfloat16) + get_attn_backend.cache_clear() + yield + # Reset the torch datatype to what it was before the test + # so as not to impact the remaining tests. + torch.set_default_dtype(default_dtype) @pytest.mark.skipif(current_platform.is_rocm(), @@ -773,10 +828,8 @@ def test_encoder_only( * max_dec_seq_len: max length of decoder input sequences * max_enc_seq_len: max length of encoder input sequences ''' - # Force Attention wrapper backend with global_force_attn_backend_context_manager(attn_backend): - # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test @@ -807,10 +860,14 @@ def test_encoder_only( # PREFILL: encoder attention enc_pckd_act_out: torch.Tensor = (_run_encoder_attention_test( - test_rsrcs.attn, enc_test_params, prephase_attn_metadata)) + test_rsrcs.attn, + enc_test_params, + prephase_attn_metadata, + test_pt=test_pt)) # - Is encoder attention result correct? 
- assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out) + assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, + attn_backend.name) @pytest.mark.skipif(current_platform.is_rocm(), @@ -892,10 +949,8 @@ def test_e2e_enc_dec_attn( * max_dec_seq_len: max length of decoder input sequences * max_enc_seq_len: max length of encoder input sequences ''' - # Force Attention wrapper backend with global_force_attn_backend_context_manager(attn_backend): - # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test @@ -955,29 +1010,39 @@ def test_e2e_enc_dec_attn( enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn, enc_test_params, - prephase_attn_metadata) + prephase_attn_metadata, + test_pt=test_pt) # - Is encoder attention result correct? - assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out) + assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, + attn_backend.name) # PREFILL: decoder self-attention test prephase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, prephase_dec_test_params, prephase_attn_metadata) + test_rsrcs, + prephase_dec_test_params, + prephase_attn_metadata, + test_pt=test_pt) # - Is prefill decoder self-attention correct? assert_actual_matches_ideal(prephase_dec_test_params, - prephase_dec_pckd_act_out) + prephase_dec_pckd_act_out, + attn_backend.name) # PREFILL: encoder/decoder cross-attention test prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, prephase_dec_test_params, prephase_cross_test_params, - prephase_attn_metadata) + test_rsrcs, + prephase_dec_test_params, + prephase_cross_test_params, + prephase_attn_metadata, + test_pt=test_pt) # - Is prefill encoder/decoder cross-attention correct? assert_actual_matches_ideal(prephase_cross_test_params, - prephase_cross_pckd_act_out) + prephase_cross_pckd_act_out, + attn_backend.name) # DECODE: build decode-phase attention metadata @@ -993,17 +1058,26 @@ def test_e2e_enc_dec_attn( # DECODE: decoder self-attention test decphase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, decphase_dec_test_params, decphase_attn_metadata) + test_rsrcs, + decphase_dec_test_params, + decphase_attn_metadata, + test_pt=test_pt) # - Is decode-phase decoder self-attention correct? assert_actual_matches_ideal(decphase_dec_test_params, - decphase_dec_pckd_act_out) + decphase_dec_pckd_act_out, + attn_backend.name) # DECODE: encoder/decoder cross-attention test decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata) + test_rsrcs, + decphase_dec_test_params, + None, + decphase_attn_metadata, + test_pt=test_pt) # - Is decode-phase encoder/decoder cross-attention correct? 
assert_actual_matches_ideal(decphase_cross_test_params, - decphase_cross_pckd_act_out) + decphase_cross_pckd_act_out, + attn_backend.name) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index c3d5252edc2a3..e7865fb2500ef 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -13,8 +13,8 @@ from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType from vllm.model_executor.layers.activation import SiluAndMul -from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL, - make_tensor_with_pad) +from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, + STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) # For now, disable "test_aot_dispatch_dynamic" since there are some # bugs related to this test in PyTorch 2.4. @@ -525,17 +525,22 @@ def make_backend(backend_name: str) -> AttentionBackend: if backend_name == STR_XFORMERS_ATTN_VAL: # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs. from vllm.attention.backends.xformers import XFormersBackend - return XFormersBackend() + elif backend_name == STR_FLASH_ATTN_VAL: + from vllm.attention.backends.flash_attn import FlashAttentionBackend + return FlashAttentionBackend() + raise AssertionError( f"Unrecognized backend_name {backend_name} for unit test") def _make_metadata_tensors( - seq_lens: Optional[List[int]], context_lens: Optional[List[int]], - encoder_seq_lens: Optional[List[int]], device: Union[torch.device, str] -) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[List[int]], - torch.Tensor, Optional[int]]: + seq_lens: Optional[List[int]], + context_lens: Optional[List[int]], + encoder_seq_lens: Optional[List[int]], + device: Union[torch.device, str], +) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor], + torch.Tensor, torch.Tensor, Optional[int]]: ''' Build scalar & tensor values required to build attention metadata structure. @@ -553,6 +558,8 @@ def _make_metadata_tensors( * max_context_len: max(context_lens) * max_seq_len: max(seq_lens) * seq_start_loc: start idx of each sequence + * encoder_seq_lens_tensor: encoder seq_lens list, as tensor + * encoder_seq_start_loc: start idx of each encoder sequence * max_encoder_seq_len: encoder seq_lens list, as tensor ''' seq_lens_tensor = maybe_make_int_tensor(seq_lens, device) @@ -566,8 +573,26 @@ def _make_metadata_tensors( seq_start_loc = None + if seq_lens_tensor is not None: + seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=seq_lens_tensor.device) + torch.cumsum(seq_lens_tensor, + dim=0, + dtype=seq_start_loc.dtype, + out=seq_start_loc[1:]) + + encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=encoder_seq_lens_tensor.device) + torch.cumsum(encoder_seq_lens_tensor, + dim=0, + dtype=encoder_seq_start_loc.dtype, + out=encoder_seq_start_loc[1:]) + return (seq_lens_tensor, context_lens_tensor, max_context_len, max_seq_len, - seq_start_loc, encoder_seq_lens_tensor, max_encoder_seq_len) + seq_start_loc, encoder_seq_lens_tensor, encoder_seq_start_loc, + max_encoder_seq_len) def make_kv_cache(num_blocks: int, @@ -575,6 +600,7 @@ def make_kv_cache(num_blocks: int, head_size: int, block_size: int, device: Union[torch.device, str], + backend: str, default_val: float = 0.0) -> torch.Tensor: ''' Create a fake KV cache. 
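The seq_start_loc and encoder_seq_start_loc tensors built above are exclusive prefix sums over the per-sequence lengths ([4, 6] becomes [0, 4, 10]). A tiny sketch of the same construction:

    import torch

    seq_lens = torch.tensor([4, 6], dtype=torch.int32)
    seq_start_loc = torch.zeros(seq_lens.shape[0] + 1, dtype=torch.int32)
    torch.cumsum(seq_lens, dim=0, dtype=seq_start_loc.dtype,
                 out=seq_start_loc[1:])
    # seq_start_loc is now tensor([0, 4, 10], dtype=torch.int32)
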
@@ -591,10 +617,20 @@ def make_kv_cache(num_blocks: int, Returns: * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size) + * for backend 'XFORMERS' + * kv_cache: 2 x num_blocks x block_size x num_heads x head_size + * for backend 'FLASH_ATTN' ''' - - kv_cache = torch.rand( - (2, num_blocks, block_size * num_heads * head_size)).to(device) + if backend == 'XFORMERS': + kv_cache = torch.rand( + (2, num_blocks, block_size * num_heads * head_size)).to(device) + elif backend == 'FLASH_ATTN': + kv_cache = torch.rand( + (2, num_blocks, block_size, num_heads, head_size)).to(device) + else: + raise ValueError( + f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or " + f"'FLASH_ATTN'.") if default_val is not None: kv_cache[:, :, :] = default_val return kv_cache @@ -858,8 +894,9 @@ def make_test_metadata( context_lens_tensor, _, _, - _, + seq_start_loc, encoder_seq_lens_tensor, + encoder_seq_start_loc, max_encoder_seq_len, ) = _make_metadata_tensors(seq_lens, context_lens, @@ -874,6 +911,7 @@ def make_test_metadata( num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, + seq_start_loc=seq_start_loc, max_prefill_seq_len=None if seq_lens is None else max(seq_lens), max_decode_seq_len=0, context_lens_tensor=context_lens_tensor, @@ -882,6 +920,7 @@ def make_test_metadata( num_encoder_tokens=num_encoder_tokens, encoder_seq_lens=encoder_seq_lens, encoder_seq_lens_tensor=encoder_seq_lens_tensor, + encoder_seq_start_loc=encoder_seq_start_loc, max_encoder_seq_len=max_encoder_seq_len, cross_slot_mapping=(None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping), @@ -904,8 +943,9 @@ def make_test_metadata( context_lens_tensor, _, _, - _, + seq_start_loc, encoder_seq_lens_tensor, + encoder_seq_start_loc, max_encoder_seq_len, ) = _make_metadata_tensors(seq_lens, context_lens, @@ -920,14 +960,17 @@ def make_test_metadata( num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, + seq_start_loc=seq_start_loc, max_prefill_seq_len=0, max_decode_seq_len=max(seq_lens), + max_decode_query_len=1, context_lens_tensor=context_lens_tensor, block_tables=kv_mmap.block_tables, use_cuda_graph=False, num_encoder_tokens=num_encoder_tokens, encoder_seq_lens=encoder_seq_lens, encoder_seq_lens_tensor=encoder_seq_lens_tensor, + encoder_seq_start_loc=encoder_seq_start_loc, max_encoder_seq_len=max_encoder_seq_len, cross_slot_mapping=(None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping), @@ -936,7 +979,8 @@ def make_test_metadata( def assert_actual_matches_ideal(test_params: PhaseTestParameters, - output_under_test: torch.Tensor) -> None: + output_under_test: torch.Tensor, + backend: str) -> None: ''' Assert that observed output matches the ideal output contained in the test parameters data structure. @@ -947,8 +991,22 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters, * output_under_test: actually observed output value ''' ideal_output = test_params.packed_qkvo.ideal_output - torch.testing.assert_close(ideal_output, - output_under_test.view_as(ideal_output)) + if backend == 'XFORMERS': + torch.testing.assert_close(ideal_output, + output_under_test.view_as(ideal_output)) + + elif backend == 'FLASH_ATTN': + # For FlashAttention override the accuracy thresholds to non default + # values since we notice a higher difference between the ideal and + # actual output. 
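To make the two cache layouts described above concrete, a sketch with made-up sizes (the leading 2 separates the key and value planes); both layouts hold the same number of elements:

    import torch

    num_blocks, block_size, num_heads, head_size = 8, 16, 4, 32
    # XFORMERS layout: per-block storage flattened into one dimension.
    xformers_kv_cache = torch.rand(
        2, num_blocks, block_size * num_heads * head_size)
    # FLASH_ATTN layout: the same elements kept as separate dimensions.
    flash_attn_kv_cache = torch.rand(
        2, num_blocks, block_size, num_heads, head_size)
    assert xformers_kv_cache.numel() == flash_attn_kv_cache.numel()
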
+ torch.testing.assert_close(ideal_output, + output_under_test.view_as(ideal_output), + atol=0.01, + rtol=0.016) + else: + raise ValueError( + f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or " + f"'FLASH_ATTN'.") # Copied/modified from torch._refs.__init__.py diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py index 483773f069133..d686f1da3fa17 100644 --- a/tests/models/encoder_decoder/vision_language/test_florence2.py +++ b/tests/models/encoder_decoder/vision_language/test_florence2.py @@ -85,7 +85,7 @@ def run_test( @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_models(hf_runner, vllm_runner, model, dtype, max_tokens, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index ab363ac78b028..2975a41797e9f 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -10,10 +10,11 @@ AttentionMetadata, AttentionMetadataBuilder, AttentionType) -from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState, - compute_slot_mapping, - compute_slot_mapping_start_idx, - is_block_tables_empty) +from vllm.attention.backends.utils import ( + PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping, + compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens, + get_seq_len_block_table_args, is_all_cross_attn_metadata_set, + is_all_encoder_attn_metadata_set, is_block_tables_empty) from vllm.forward_context import get_forward_context from vllm.multimodal import MultiModalPlaceholderMap from vllm.utils import (async_tensor_h2d, direct_register_custom_op, @@ -73,7 +74,6 @@ def swap_blocks( src_key_cache = src_kv_cache[0] dst_key_cache = dst_kv_cache[0] ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) - src_value_cache = src_kv_cache[1] dst_value_cache = dst_kv_cache[1] ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) @@ -85,6 +85,7 @@ def copy_blocks( ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] value_caches = [kv_cache[1] for kv_cache in kv_caches] + ops.copy_blocks(key_caches, value_caches, src_to_dists) @@ -111,26 +112,12 @@ class FlashAttentionMetadata(AttentionMetadata): # |-------------------- seq_len ---------------------| # |-- query_len ---| - # Maximum query length in the batch. - max_query_len: Optional[int] - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] - # Maximum sequence length among prefill batch. 0 if there are decoding # requests only. max_prefill_seq_len: int # Maximum sequence length among decode batch. 0 if there are prefill # requests only. max_decode_seq_len: int - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] # (batch_size,) A tensor of context lengths (tokens that are computed # so far). 
context_lens_tensor: Optional[torch.Tensor] @@ -146,11 +133,62 @@ class FlashAttentionMetadata(AttentionMetadata): # Whether or not if cuda graph is enabled. # Cuda-graph is currently enabled for decoding only. # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. + use_cuda_graph: bool + # Maximum query length in the batch. + max_query_len: Optional[int] = None + + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] = None + + # (batch_size + 1,). The cumulative subquery lengths of the sequences in + # the batch, used to index into subquery. E.g., if the subquery length + # is [4, 6], it is [0, 4, 10]. + query_start_loc: Optional[torch.Tensor] = None + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + seq_start_loc: Optional[torch.Tensor] = None + _cached_prefill_metadata: Optional["FlashAttentionMetadata"] = None _cached_decode_metadata: Optional["FlashAttentionMetadata"] = None + # Begin encoder attn & enc/dec cross-attn fields... + + # Encoder sequence lengths representation + encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens_tensor: Optional[torch.Tensor] = None + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + encoder_seq_start_loc: Optional[torch.Tensor] = None + # Maximum sequence length among encoder sequences + max_encoder_seq_len: Optional[int] = None + # Number of tokens input to encoder + num_encoder_tokens: Optional[int] = None + + # Cross-attention memory-mapping data structures: slot mapping + # and block tables + cross_slot_mapping: Optional[torch.Tensor] = None + cross_block_tables: Optional[torch.Tensor] = None + + @property + def is_all_encoder_attn_metadata_set(self): + ''' + All attention metadata required for encoder attention is set. + ''' + return is_all_encoder_attn_metadata_set(self) + + @property + def is_all_cross_attn_metadata_set(self): + ''' + All attention metadata required for enc/dec cross-attention is set. + + Superset of encoder attention required metadata. 
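One practical reason the metadata fields above are regrouped so that the ones defaulting to None come last: Python dataclasses require fields with defaults to follow fields without them, and the defaults let callers omit whatever encoder or decoder data they do not have. A toy sketch (not the real metadata class):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class ToyMetadata:               # toy stand-in, not the real class
        num_prefill_tokens: int      # required field, no default
        use_cuda_graph: bool         # required field, no default
        max_query_len: Optional[int] = None          # optional fields trail
        encoder_seq_start_loc: Optional[int] = None  # new optional field

    # Callers that have no encoder data can simply omit the optional fields.
    ToyMetadata(num_prefill_tokens=4, use_cuda_graph=False)
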
+ ''' + return is_all_cross_attn_metadata_set(self) + @property def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: if self.num_prefills == 0: @@ -159,32 +197,52 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: if self._cached_prefill_metadata is not None: return self._cached_prefill_metadata - assert self.seq_lens is not None - assert self.seq_lens_tensor is not None - assert self.query_start_loc is not None - assert self.context_lens_tensor is not None - assert self.block_tables is not None - assert self.seq_start_loc is not None + assert ((self.seq_lens is not None) + or (self.encoder_seq_lens is not None)) + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) + + # Compute some attn_metadata fields which default to None + query_start_loc = (None if self.query_start_loc is None else + self.query_start_loc[:self.num_prefills + 1]) + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[:self.num_prefill_tokens]) + seq_lens = (None if self.seq_lens is None else + self.seq_lens[:self.num_prefills]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[:self.num_prefills]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) + context_lens_tensor = (None if self.context_lens_tensor is None else + self.context_lens_tensor[:self.num_prefills]) + block_tables = (None if self.block_tables is None else + self.block_tables[:self.num_prefills]) self._cached_prefill_metadata = FlashAttentionMetadata( num_prefills=self.num_prefills, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, - slot_mapping=self.slot_mapping[:self.num_prefill_tokens], + slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=self. multi_modal_placeholder_index_maps, - seq_lens=self.seq_lens[:self.num_prefills], - seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, max_query_len=self.max_query_len, max_prefill_seq_len=self.max_prefill_seq_len, max_decode_query_len=0, max_decode_seq_len=0, - query_start_loc=self.query_start_loc[:self.num_prefills + 1], - seq_start_loc=self.seq_start_loc[:self.num_prefills + 1], - context_lens_tensor=self.context_lens_tensor[:self.num_prefills], - block_tables=self.block_tables[:self.num_prefills], + query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, + context_lens_tensor=context_lens_tensor, + block_tables=block_tables, use_cuda_graph=False, - ) + # Begin encoder & cross attn fields below... 
+ encoder_seq_lens=self.encoder_seq_lens, + encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, + encoder_seq_start_loc=self.encoder_seq_start_loc, + max_encoder_seq_len=self.max_encoder_seq_len, + cross_slot_mapping=self.cross_slot_mapping, + cross_block_tables=self.cross_block_tables) return self._cached_prefill_metadata @property @@ -194,17 +252,25 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: if self._cached_decode_metadata is not None: return self._cached_decode_metadata - assert self.block_tables is not None - assert self.seq_lens_tensor is not None + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) + + # Compute some attn_metadata fields which default to None + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[self.num_prefill_tokens:]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[self.num_prefills:]) + block_tables = (None if self.block_tables is None else + self.block_tables[self.num_prefills:]) self._cached_decode_metadata = FlashAttentionMetadata( num_prefills=0, num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, - slot_mapping=self.slot_mapping[self.num_prefill_tokens:], + slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=None, seq_lens=None, - seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], + seq_lens_tensor=seq_lens_tensor, max_decode_query_len=self.max_decode_query_len, max_query_len=self.max_query_len, max_prefill_seq_len=0, @@ -214,9 +280,15 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: seq_start_loc=self.seq_start_loc[self.num_prefills:] if self.seq_start_loc is not None else None, context_lens_tensor=None, - block_tables=self.block_tables[self.num_prefills:], + block_tables=block_tables, use_cuda_graph=self.use_cuda_graph, - ) + # Begin encoder & cross attn fields below... + encoder_seq_lens=self.encoder_seq_lens, + encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, + encoder_seq_start_loc=self.encoder_seq_start_loc, + max_encoder_seq_len=self.max_encoder_seq_len, + cross_slot_mapping=self.cross_slot_mapping, + cross_block_tables=self.cross_block_tables) return self._cached_decode_metadata def advance_step(self, @@ -586,16 +658,20 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "FlashAttentionImpl") - # NOTE(woosuk): FlashAttention does not support FP8 KV cache. 
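The cached prefill and decode views built above all follow one slicing convention: per-request tensors are cut at num_prefills, per-token tensors at num_prefill_tokens. A compact sketch with made-up counts:

    import torch

    num_prefills, num_prefill_tokens = 2, 12   # 2 prefill requests, 12 tokens
    num_decode_tokens = 3                      # 3 decode requests, 1 token each

    seq_lens_tensor = torch.tensor([7, 5, 9, 4, 2])
    slot_mapping = torch.arange(num_prefill_tokens + num_decode_tokens)

    # Per-request tensors are split by request count ...
    prefill_seq_lens = seq_lens_tensor[:num_prefills]    # tensor([7, 5])
    decode_seq_lens = seq_lens_tensor[num_prefills:]     # tensor([9, 4, 2])
    # ... while per-token tensors are split by token count.
    prefill_slots = slot_mapping[:num_prefill_tokens]
    decode_slots = slot_mapping[num_prefill_tokens:]
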
assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") + if (attn_type == AttentionType.ENCODER + and (not attn_metadata.is_all_encoder_attn_metadata_set)): + raise AttributeError("Encoder attention requires setting " + "encoder metadata attributes.") + elif (attn_type == AttentionType.ENCODER_DECODER + and (not attn_metadata.is_all_cross_attn_metadata_set)): + raise AttributeError("Encoder/decoder cross-attention " + "requires setting cross-attention " + "metadata attributes.") + output = torch.ops.vllm.unified_flash_attention( query, key, @@ -608,6 +684,7 @@ def forward( k_scale, v_scale, self.scale, + attn_type.value, self.sliding_window, self.alibi_slopes, self.logits_soft_cap, @@ -616,6 +693,89 @@ def forward( return output +def _get_query_key_seq_metadata( + attn_metadata, + is_prompt: bool, + attn_type: AttentionType, +) -> tuple: + """ + Returns sequence metadata for key and query based on the specified + attention type and whether input is a prompt. + + This function computes the starting locations and maximum sequence lengths + for key and query sequences for different attention types. + + Args: + attn_metadata: The attention metadata object + is_prompt (bool): A flag indicating if the input is a prompt + attn_type (AttentionType): The type of attention being used. + + Returns: + tuple: A tuple containing four integers: + - Starting location for the query sequence. + - Maximum sequence length for the query sequence. + - Starting location for the key sequence. + - Maximum sequence length for the key sequence. + + Raises: + AttributeError: If an invalid attention type is provided. + """ + if attn_type == AttentionType.DECODER: + # Decoder self-attention + # Choose max_seq_len based on whether we are in prompt_run + if is_prompt: + max_seq_len = attn_metadata.max_prefill_seq_len + else: + max_seq_len = attn_metadata.max_decode_seq_len + return (attn_metadata.seq_start_loc, max_seq_len, + attn_metadata.seq_start_loc, max_seq_len) + + elif attn_type == AttentionType.ENCODER_DECODER: + # This is cross attention between the where the key + # is the precomputed encoder attention and query + # is the input sequence. + # Choose query max length based on whether it is prompt + # or not. + if is_prompt: + max_seq_len = attn_metadata.max_prefill_seq_len + else: + max_seq_len = attn_metadata.max_decode_seq_len + return (attn_metadata.seq_start_loc, max_seq_len, + attn_metadata.encoder_seq_start_loc, + attn_metadata.max_encoder_seq_len) + elif attn_type == AttentionType.ENCODER: + # For encoder attention both the query and the key are same i.e the + # encoder sequence. + return (attn_metadata.encoder_seq_start_loc, + attn_metadata.max_encoder_seq_len, + attn_metadata.encoder_seq_start_loc, + attn_metadata.max_encoder_seq_len) + elif attn_type == AttentionType.ENCODER_ONLY: + assert is_prompt, "Should not have decode for encoder only model." + return (attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len, + attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len) + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + +def _get_causal_option(attn_type: AttentionType) -> bool: + """ + Determine whether the given attention type is suitable for causal + attention mechanisms. 
+ + Args: + attn_type (AttentionType): The type of attention being evaluated + + Returns: + bool: Returns `True` if the attention type is suitable for causal + attention (i.e., not encoder, encoder-only, or encoder-decoder), + otherwise returns `False`. + """ + return not (attn_type == AttentionType.ENCODER + or attn_type == AttentionType.ENCODER_ONLY + or attn_type == AttentionType.ENCODER_DECODER) + + def unified_flash_attention( query: torch.Tensor, key: torch.Tensor, @@ -628,60 +788,76 @@ def unified_flash_attention( k_scale: float, v_scale: float, softmax_scale: float, + attn_type_int_val: int, window_size: Optional[List[int]] = None, alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, ) -> torch.Tensor: + # Convert integer attn_type to enum + try: + attn_type = AttentionType(attn_type_int_val) + except ValueError as err: + raise AttributeError( + f"Invalid attention type {str(attn_type_int_val)}") from err + current_metadata = get_forward_context() assert current_metadata is not None assert isinstance(current_metadata, FlashAttentionMetadata) attn_metadata: FlashAttentionMetadata = current_metadata num_tokens, hidden_size = query.shape + # Reshape the query, key, and value tensors. query = query.view(-1, num_heads, head_size) - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) + if (key is not None) and (value is not None): + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) if kv_cache.numel() > 0: key_cache = kv_cache[0] value_cache = kv_cache[1] + # We skip updating the KV cache under two conditions: + # a. When the Attention Type is ENCODER. In this phase, we compute + # only the encoder attention without updating the cache. + # b. When both Key and Value are None. This occurs during + # cross-attention computation in the decoding phase, where the KV + # cache is already populated with the cross-attention tensor. + # Thus, we skip cache updates during this time. + if (attn_type != AttentionType.ENCODER) and (key is not None) and ( + value is not None): + if attn_type == AttentionType.ENCODER_DECODER: + # Update cross-attention KV cache (prefill-only) + updated_slot_mapping = attn_metadata.cross_slot_mapping + else: + # Update self-attention KV cache (prefill/decode) + updated_slot_mapping = attn_metadata.slot_mapping + + # Reshape the input keys and values and store them in the cache. + # If kv_cache is not provided, the new key and value tensors are + # not cached. This happens during the initial memory profiling run. + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + kv_cache[0], + kv_cache[1], + updated_slot_mapping.flatten(), # type: ignore[union-attr] + kv_cache_dtype, + k_scale, + v_scale, + ) - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory profiling run. 
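A toy sketch of the attn_type round trip used above: the caller passes the enum's value across the custom-op boundary and the op rebuilds the enum, raising on anything unknown. The members and values below are placeholders, not the real AttentionType definition:

    import enum

    class ToyAttentionType(enum.Enum):   # stand-in; not the real AttentionType
        DECODER = 0
        ENCODER = 1
        ENCODER_DECODER = 2

    wire_value = ToyAttentionType.ENCODER.value   # what crosses the op boundary
    try:
        attn_type = ToyAttentionType(wire_value)  # rebuilt on the other side
    except ValueError as err:
        raise AttributeError(f"Invalid attention type {wire_value}") from err
    assert attn_type is ToyAttentionType.ENCODER
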
- torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], - attn_metadata.slot_mapping.flatten(), - kv_cache_dtype, - k_scale, - v_scale, - ) - - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - assert key.shape[0] == num_prefill_tokens + num_decode_tokens, \ - f"key : {key.shape} : #prefill tokens {num_prefill_tokens} : #decode tokens {num_decode_tokens}" # noqa - assert value.shape[0] == num_prefill_tokens + num_decode_tokens, \ - f"value : {value.shape} : #prefill toks {num_prefill_tokens} : #decode toks {num_decode_tokens}" # noqa - - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_tokens:] + (num_prefill_query_tokens, num_prefill_kv_tokens, + num_decode_query_tokens) = \ + get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) + decode_query = query[num_prefill_query_tokens:] # QKV for prefill. - query = query[:num_prefill_tokens] - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] - - assert query.shape[0] == num_prefill_tokens - assert decode_query.shape[0] == num_decode_tokens + query = query[:num_prefill_query_tokens] + assert query.shape[0] == num_prefill_query_tokens + assert decode_query.shape[0] == num_decode_query_tokens prefill_output: Optional[torch.Tensor] = None decode_output: Optional[torch.Tensor] = None - if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. if (kv_cache.numel() == 0 or prefill_meta.block_tables is None @@ -689,22 +865,30 @@ def unified_flash_attention( # normal attention # When block_tables are not filled, it means q and k are the # prompt, and they have the same length. + q_seq_start_loc, q_seq_len, k_seq_start_loc, k_seq_len = \ + _get_query_key_seq_metadata(prefill_meta, True, attn_type) + + key = key[:num_prefill_kv_tokens] + value = value[:num_prefill_kv_tokens] + prefill_output = flash_attn_varlen_func( q=query, k=key, v=value, - cu_seqlens_q=prefill_meta.seq_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_q=prefill_meta.max_prefill_seq_len, - max_seqlen_k=prefill_meta.max_prefill_seq_len, + cu_seqlens_q=q_seq_start_loc, + cu_seqlens_k=k_seq_start_loc, + max_seqlen_q=q_seq_len, + max_seqlen_k=k_seq_len, softmax_scale=softmax_scale, - causal=True, + causal=_get_causal_option(attn_type), window_size=window_size, alibi_slopes=alibi_slopes, softcap=logits_soft_cap, ) else: # prefix-enabled attention + assert attn_type == AttentionType.DECODER, ( + "Only decoder-only models support prefix caching") assert prefill_meta.seq_lens is not None max_seq_len = max(prefill_meta.seq_lens) prefill_output = flash_attn_varlen_func( # noqa @@ -729,6 +913,8 @@ def unified_flash_attention( # because different queries might have different lengths. assert decode_meta.max_decode_query_len is not None if decode_meta.max_decode_query_len > 1: + assert attn_type == AttentionType.DECODER, ( + "Only decoder-only models support max_decode_query_len > 1") decode_output = flash_attn_varlen_func( q=decode_query, k=key_cache, @@ -746,12 +932,17 @@ def unified_flash_attention( ) else: # Use flash_attn_with_kvcache for normal decoding. 
+ ( + seq_lens_arg, + _, + block_tables_arg, + ) = get_seq_len_block_table_args(decode_meta, False, attn_type) decode_output = flash_attn_with_kvcache( q=decode_query.unsqueeze(1), k_cache=key_cache, v_cache=value_cache, - block_table=decode_meta.block_tables, - cache_seqlens=decode_meta.seq_lens_tensor, + block_table=block_tables_arg, + cache_seqlens=seq_lens_arg, softmax_scale=softmax_scale, causal=True, window_size=window_size, @@ -761,10 +952,10 @@ def unified_flash_attention( if prefill_output is None: assert decode_output is not None - return decode_output.view(num_decode_tokens, hidden_size) + return decode_output.view(num_decode_query_tokens, hidden_size) if decode_output is None: assert prefill_output is not None - return prefill_output.view(num_prefill_tokens, hidden_size) + return prefill_output.view(num_prefill_query_tokens, hidden_size) # Chunked prefill does not work with speculative decoding. # Therefore, the query length for decode should be 1 in chunked prefill. @@ -786,6 +977,7 @@ def unified_flash_attention_fake( k_scale: float, v_scale: float, softmax_scale: float, + attn_type_int_val: int, window_size: Optional[List[int]] = None, alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 55293bbb06e1d..096c920c4833a 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -1,13 +1,14 @@ """Attention backend utils""" from collections import defaultdict from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Dict, List, Type, TypeVar, Union +from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union import numpy as np import torch from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder, AttentionState) +from vllm.attention.backends.abstract import AttentionType from vllm.multimodal import MultiModalPlaceholderMap from vllm.utils import async_tensor_h2d, make_tensor_with_pad @@ -336,11 +337,13 @@ def graph_capture_get_metadata_for_batch( use_cuda_graph=True, ) if is_encoder_decoder_model: - # The encoder decoder model works only with XFormers backend. - # Assert the same. - assert self.runner.attn_backend.get_name() == "XFORMERS", \ - f"Expected attn_backend name to be 'XFORMERS', but "\ - f" got '{self.runner.attn_backend.get_name()}'" + # The encoder decoder model works only with XFormers and + # Flash Attention backend. Assert the same. + assert self.runner.attn_backend.get_name() in\ + ["XFORMERS", "FLASH_ATTN"], \ + f"Expected attn_backend name to be either 'XFORMERS' or " \ + f"'FLASH_ATTN', but "\ + f"got '{self.runner.attn_backend.get_name()}'" self._update_captured_metadata_for_enc_dec_model( batch_size=batch_size, attn_metadata=attn_metadata) @@ -356,11 +359,13 @@ def get_graph_input_buffers( "block_tables": attn_metadata.decode_metadata.block_tables, } if is_encoder_decoder_model: - # The encoder decoder model works only with XFormers backend. - # Assert the same. - assert self.runner.attn_backend.get_name() == "XFORMERS", \ - f"Expected attn_backend name to be 'XFORMERS', but "\ - f" got '{self.runner.attn_backend.get_name()}'" + # The encoder decoder model works only with XFormers and + # Flash Attention backend. Assert the same. 
+ assert self.runner.attn_backend.get_name() in\ + ["XFORMERS", "FLASH_ATTN"], \ + f"Expected attn_backend name to be either 'XFORMERS' or "\ + f"'FLASH_ATTN', but "\ + f"got '{self.runner.attn_backend.get_name()}'" self._add_additonal_input_buffers_for_enc_dec_model( attn_metadata=attn_metadata, input_buffers=input_buffers) return input_buffers @@ -375,11 +380,13 @@ def prepare_graph_input_buffers( input_buffers["block_tables"].copy_( attn_metadata.decode_metadata.block_tables, non_blocking=True) if is_encoder_decoder_model: - # The encoder decoder model works only with XFormers backend. - # Assert the same. - assert self.runner.attn_backend.get_name() == "XFORMERS", \ - f"Expected attn_backend name to be 'XFORMERS', but "\ - f" got '{self.runner.attn_backend.get_name()}'" + # The encoder decoder model works only with XFormers and + # Flash Attention backend. Assert the same. + assert self.runner.attn_backend.get_name() in\ + ["XFORMERS", "FLASH_ATTN"], \ + f"Expected attn_backend name to be either 'XFORMERS' or "\ + f"'FLASH_ATTN', but "\ + f"got '{self.runner.attn_backend.get_name()}'" self._prepare_input_buffers_for_enc_dec_model( attn_metadata, input_buffers) @@ -411,6 +418,7 @@ def _update_captured_metadata_for_enc_dec_model(self, batch_size: int, attn_metadata.encoder_seq_lens_tensor = torch.full( (batch_size, ), 1, dtype=torch.int).cuda() attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture + attn_metadata.num_encoder_tokens = 0 def _add_additonal_input_buffers_for_enc_dec_model( self, attn_metadata, input_buffers: Dict[str, Any]): @@ -453,3 +461,122 @@ def _prepare_input_buffers_for_enc_dec_model(self, attn_metadata, input_buffers["cross_block_tables"].copy_( attn_metadata.decode_metadata.cross_block_tables, non_blocking=True) + + +def is_all_encoder_attn_metadata_set(attn_metadata): + ''' + All attention metadata required for encoder attention is set. + ''' + return ((attn_metadata.encoder_seq_lens is not None) + and (attn_metadata.encoder_seq_lens_tensor is not None) + and (attn_metadata.max_encoder_seq_len is not None)) + + +def is_all_cross_attn_metadata_set(attn_metadata): + ''' + All attention metadata required for enc/dec cross-attention is set. + + Superset of encoder attention required metadata. + ''' + return (attn_metadata.is_all_encoder_attn_metadata_set + and (attn_metadata.cross_slot_mapping is not None) + and (attn_metadata.cross_block_tables is not None)) + + +def get_seq_len_block_table_args( + attn_metadata, + is_prompt: bool, + attn_type: AttentionType, +) -> tuple: + ''' + The particular choice of sequence-length- and block-table-related + attributes which should be extracted from attn_metadata is dependent + on the type of attention operation. 
+ + Decoder attn -> select entirely decoder self-attention-related fields + Encoder/decoder cross-attn -> select encoder sequence lengths & + cross-attn block-tables fields + Encoder attn -> select encoder sequence lengths fields & no block tables + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention op + * is_prompt: True if prefill, False otherwise + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + + * Appropriate sequence-lengths tensor + * Appropriate max sequence-length scalar + * Appropriate block tables (or None) + ''' + + if attn_type == AttentionType.DECODER: + # Decoder self-attention + # Choose max_seq_len based on whether we are in prompt_run + if is_prompt: + max_seq_len = attn_metadata.max_prefill_seq_len + else: + max_seq_len = attn_metadata.max_decode_seq_len + return (attn_metadata.seq_lens_tensor, max_seq_len, + attn_metadata.block_tables) + elif attn_type == AttentionType.ENCODER_DECODER: + # Enc/dec cross-attention KVs match encoder sequence length; + # cross-attention utilizes special "cross" block tables + return (attn_metadata.encoder_seq_lens_tensor, + attn_metadata.max_encoder_seq_len, + attn_metadata.cross_block_tables) + elif attn_type == AttentionType.ENCODER: + # No block tables associated with encoder attention + return (attn_metadata.encoder_seq_lens_tensor, + attn_metadata.max_encoder_seq_len, None) + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + +def get_num_prefill_decode_query_kv_tokens( + attn_metadata, + attn_type: AttentionType, +) -> Tuple[int, int, int]: + """ + Calculate the number of prefill and decode tokens for query, key/value + based on the attention metadata and the specified attention type. + + Args: + attn_metadata (FlashAttentionMetadata): Attention Metadata object. + attn_type (AttentionType): The type of attention being used. + Returns: + Tuple[int, int, int]: A tuple containing three integers: + - The number of prefill query tokens. + - The number of prefill key/value tokens. + - The number of decode query tokens. + + Raises: + AssertionError: If the number of encoder tokens in `attn_metadata` + is `None` when required for the calculations. + """ + num_prefill_query_tokens = 0 + num_decode_query_tokens = 0 + num_prefill_kv_tokens = 0 + if attn_type == AttentionType.ENCODER: + # Encoder attention is only invoked during prefill phase. + # The same input servers a both query and key. + assert attn_metadata.num_encoder_tokens is not None + num_prefill_query_tokens = attn_metadata.num_encoder_tokens + num_prefill_kv_tokens = attn_metadata.num_encoder_tokens + num_decode_query_tokens = 0 + elif attn_type == AttentionType.ENCODER_DECODER: + assert attn_metadata.num_encoder_tokens is not None + num_prefill_query_tokens = attn_metadata.num_prefill_tokens + # The key is the encoder/cross-attention. 
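To see why separate query and key/value token counts are returned here, a sketch (made-up numbers) of the cross-attention prefill case, where the decoder supplies the query tokens and the encoder supplies the key/value tokens:

    import torch

    hidden_size = 64
    num_prefill_query_tokens = 4   # decoder-side tokens (the query)
    num_prefill_kv_tokens = 6      # encoder-side tokens (key/value)
    num_decode_query_tokens = 2

    query = torch.rand(
        num_prefill_query_tokens + num_decode_query_tokens, hidden_size)
    key = torch.rand(num_prefill_kv_tokens, hidden_size)
    value = torch.rand(num_prefill_kv_tokens, hidden_size)

    decode_query = query[num_prefill_query_tokens:]    # 2 decode rows
    prefill_query = query[:num_prefill_query_tokens]   # 4 prefill rows
    prefill_key = key[:num_prefill_kv_tokens]          # all 6 encoder rows
    prefill_value = value[:num_prefill_kv_tokens]
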
+ num_prefill_kv_tokens = attn_metadata.num_encoder_tokens + num_decode_query_tokens = attn_metadata.num_decode_tokens + else: # attn_type == AttentionType.DECODER or + # attn_type == AttentionType.ENCODER_ONLY + num_prefill_query_tokens = attn_metadata.num_prefill_tokens + num_prefill_kv_tokens = attn_metadata.num_prefill_tokens + num_decode_query_tokens = attn_metadata.num_decode_tokens + + return (num_prefill_query_tokens, num_prefill_kv_tokens, + num_decode_query_tokens) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 21877f2dded0e..4725413baade7 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -11,8 +11,10 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) -from vllm.attention.backends.utils import (CommonAttentionState, - CommonMetadataBuilder) +from vllm.attention.backends.utils import ( + CommonAttentionState, CommonMetadataBuilder, + get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args, + is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set) from vllm.attention.ops.paged_attn import (PagedAttention, PagedAttentionMetadata) from vllm.logger import init_logger @@ -135,6 +137,11 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): # Encoder sequence lengths representation encoder_seq_lens: Optional[List[int]] = None encoder_seq_lens_tensor: Optional[torch.Tensor] = None + # FIXME: It is for flash attn. + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + encoder_seq_start_loc: Optional[torch.Tensor] = None # Maximum sequence length among encoder sequences max_encoder_seq_len: Optional[int] = None @@ -162,9 +169,7 @@ def is_all_encoder_attn_metadata_set(self): ''' All attention metadata required for encoder attention is set. ''' - return ((self.encoder_seq_lens is not None) - and (self.encoder_seq_lens_tensor is not None) - and (self.max_encoder_seq_len is not None)) + return is_all_encoder_attn_metadata_set(self) @property def is_all_cross_attn_metadata_set(self): @@ -173,9 +178,7 @@ def is_all_cross_attn_metadata_set(self): Superset of encoder attention required metadata. ''' - return (self.is_all_encoder_attn_metadata_set - and (self.cross_slot_mapping is not None) - and (self.cross_block_tables is not None)) + return is_all_cross_attn_metadata_set(self) @property def prefill_metadata(self) -> Optional["XFormersMetadata"]: @@ -329,64 +332,6 @@ def _set_attn_bias( raise AttributeError(f"Invalid attention type {str(attn_type)}") -def _get_seq_len_block_table_args( - attn_metadata: XFormersMetadata, - is_prompt: bool, - attn_type: AttentionType, -) -> tuple: - ''' - The particular choice of sequence-length- and block-table-related - attributes which should be extracted from attn_metadata is dependent - on the type of attention operation. 
- - Decoder attn -> select entirely decoder self-attention-related fields - Encoder/decoder cross-attn -> select encoder sequence lengths & - cross-attn block-tables fields - Encoder attn -> select encoder sequence lengths fields & no block tables - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention op - * is_prompt: True if prefill, False otherwise - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - - * Appropriate sequence-lengths tensor - * Appropriate max sequence-length scalar - * Appropriate block tables (or None) - ''' - - if attn_type == AttentionType.DECODER: - # Decoder self-attention - # Choose max_seq_len based on whether we are in prompt_run - if is_prompt: - max_seq_len = attn_metadata.max_prefill_seq_len - else: - max_seq_len = attn_metadata.max_decode_seq_len - return (attn_metadata.seq_lens_tensor, max_seq_len, - attn_metadata.block_tables) - elif attn_type == AttentionType.ENCODER_DECODER: - # Enc/dec cross-attention KVs match encoder sequence length; - # cross-attention utilizes special "cross" block tables - return (attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, - attn_metadata.cross_block_tables) - elif attn_type == AttentionType.ENCODER: - # No block tables associated with encoder attention - return (attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, None) - elif attn_type == AttentionType.ENCODER_ONLY: - assert is_prompt, "Should not have decode for encoder only model." - - # No block tables associated with encoder attention - return (attn_metadata.seq_lens_tensor, - attn_metadata.max_prefill_seq_len, None) - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - class XFormersMetadataBuilder(CommonMetadataBuilder[XFormersMetadata]): _metadata_cls = XFormersMetadata @@ -574,45 +519,21 @@ def forward( updated_slot_mapping, self.kv_cache_dtype, k_scale, v_scale) - - if attn_type == AttentionType.ENCODER: - # Encoder attention - chunked prefill is not applicable; - # derive token-count from query shape & and treat them - # as 100% prefill tokens - assert attn_metadata.num_encoder_tokens is not None - num_prefill_tokens = attn_metadata.num_encoder_tokens - num_encoder_tokens = attn_metadata.num_encoder_tokens - num_decode_tokens = 0 - elif attn_type == AttentionType.DECODER: - # Decoder self-attention supports chunked prefill. - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_encoder_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - # Only enforce this shape-constraint for decoder - # self-attention - assert key.shape[0] == num_prefill_tokens + num_decode_tokens - assert value.shape[0] == num_prefill_tokens + num_decode_tokens - else: # attn_type == AttentionType.ENCODER_DECODER - # Encoder/decoder cross-attention requires no chunked - # prefill (100% prefill or 100% decode tokens, no mix) - num_prefill_tokens = attn_metadata.num_prefill_tokens - if attn_metadata.num_encoder_tokens is not None: - num_encoder_tokens = attn_metadata.num_encoder_tokens - else: - num_encoder_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens + (num_prefill_query_tokens, num_prefill_kv_tokens, + num_decode_query_tokens) = \ + get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) output = torch.empty_like(query) # Query for decode. KV is not needed because it is already cached. 
- decode_query = query[num_prefill_tokens:] + decode_query = query[num_prefill_query_tokens:] # QKV for prefill. - query = query[:num_prefill_tokens] + query = query[:num_prefill_query_tokens] if key is not None and value is not None: - key = key[:num_encoder_tokens] - value = value[:num_encoder_tokens] + key = key[:num_prefill_kv_tokens] + value = value[:num_prefill_kv_tokens] - assert query.shape[0] == num_prefill_tokens - assert decode_query.shape[0] == num_decode_tokens + assert query.shape[0] == num_prefill_query_tokens + assert decode_query.shape[0] == num_decode_query_tokens if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. @@ -622,8 +543,8 @@ def forward( # prefix. out = self._run_memory_efficient_xformers_forward( query, key, value, prefill_meta, attn_type=attn_type) - assert out.shape == output[:num_prefill_tokens].shape - output[:num_prefill_tokens] = out + assert out.shape == output[:num_prefill_query_tokens].shape + output[:num_prefill_query_tokens] = out else: assert attn_type != AttentionType.ENCODER_ONLY, ( "Encoder-only models should not have prefix attention.") @@ -652,8 +573,8 @@ def forward( k_scale, v_scale, ) - assert output[:num_prefill_tokens].shape == out.shape - output[:num_prefill_tokens] = out + assert output[:num_prefill_query_tokens].shape == out.shape + output[:num_prefill_query_tokens] = out if decode_meta := attn_metadata.decode_metadata: assert attn_type != AttentionType.ENCODER_ONLY, ( @@ -663,9 +584,9 @@ def forward( seq_lens_arg, max_seq_len_arg, block_tables_arg, - ) = _get_seq_len_block_table_args(decode_meta, False, attn_type) + ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - output[num_prefill_tokens:] = PagedAttention.forward_decode( + output[num_prefill_query_tokens:] = PagedAttention.forward_decode( decode_query, key_cache, value_cache, diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 376b3136f0fb8..8a59cf41a689e 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -98,7 +98,6 @@ def get_attn_backend( is_blocksparse: bool = False, ) -> Type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" - if is_blocksparse: logger.info("Using BlocksparseFlashAttention backend.") from vllm.attention.backends.blocksparse_attn import ( @@ -108,6 +107,7 @@ def get_attn_backend( backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size, is_attention_free) if backend == _Backend.FLASH_ATTN: + logger.info("Using Flash Attention backend.") from vllm.attention.backends.flash_attn import ( # noqa: F401 FlashAttentionBackend) return FlashAttentionBackend diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index cbdacf779b089..0543ca978b7dd 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -624,8 +624,6 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, Decoder output torch.Tensor """ # retrieve input_ids and inputs_embeds - - input_ids = input_ids.view(-1, input_ids.shape[-1]) inputs_embeds = self.embed_tokens(input_ids) embed_pos = self.embed_positions( diff --git a/vllm/utils.py b/vllm/utils.py index 5488719cc99b0..1041120a24b3f 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -80,8 +80,8 @@ "currently supported with encoder/" "decoder models.") -STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers is the only backend " - "currently supported with encoder/" +STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only " + "backends 
currently supported with encoder/" "decoder models.") STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER = ("Prompt adapters are not " diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index a4b665d71f28a..2ea314f8608ee 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -19,6 +19,7 @@ from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.model_loader.utils import get_architecture_class_name from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, MultiModalRegistry) from vllm.sampling_params import SamplingParams @@ -36,6 +37,11 @@ logger = init_logger(__name__) +# The Mllama model has PagedAttention specific logic because of which it +# can only be run with the XFORMERS backend +# TODO Make Mllama model work with Flash Attention backend. +_XFORMERS_ONLY_ENCODER_DECODER_ARCHS = ["MllamaForConditionalGeneration"] + @dataclasses.dataclass(frozen=True) class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): @@ -101,9 +107,7 @@ def __init__( models) but these arguments are present here for compatibility with the base-class constructor. ''' - - self._maybe_force_supported_attention_backend() - + self._maybe_force_supported_attention_backend(model_config) super().__init__( model_config, parallel_config, @@ -119,7 +123,12 @@ def __init__( # Crash for unsupported encoder/scenarios assert_enc_dec_mr_supported_scenario(self) - def _maybe_force_supported_attention_backend(self): + def _is_xformers_only_encoder_decoder_model(self, + model: ModelConfig) -> bool: + return get_architecture_class_name( + model) in _XFORMERS_ONLY_ENCODER_DECODER_ARCHS + + def _maybe_force_supported_attention_backend(self, model: ModelConfig): ''' Force vLLM to use the XFormers attention backend, which is currently the only supported option. @@ -135,22 +144,26 @@ def raise_backend_err(): is_forced_by_global = maybe_global_forced_backend is not None is_forced_by_env_var = maybe_env_var_forced_backend is not None - if not (is_forced_by_global or is_forced_by_env_var): + if not (is_forced_by_global or is_forced_by_env_var) \ + and self._is_xformers_only_encoder_decoder_model(model): # The user has not already specified an attention backend # override - logger.info("EncoderDecoderModelRunner requires " - "XFormers backend; overriding backend " - "auto-selection and forcing XFormers.") + logger.info( + "Encoder-Decoder Model Architecture %s requires XFormers " + "backend; overriding backend auto-selection and " + "forcing XFormers.", get_architecture_class_name(model)) global_force_attn_backend(_Backend.XFORMERS) elif is_forced_by_global: # Backend override enforced by global variable takes # precedence over vLLM backend environment variable. 
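The override handling above reduces to a small decision: honor an explicit, supported override, otherwise pin only the XFormers-only architectures. A simplified, hypothetical helper capturing that shape (the real method distinguishes the global and environment-variable overrides; the architecture and backend names come from the diff, but this helper itself does not exist in vLLM):

    from typing import Optional

    _XFORMERS_ONLY_ENCODER_DECODER_ARCHS = ["MllamaForConditionalGeneration"]
    _SUPPORTED_ENC_DEC_BACKENDS = ["XFORMERS", "FLASH_ATTN"]

    def pick_forced_backend(arch: str, forced: Optional[str]) -> Optional[str]:
        # An explicit override wins, but only if it is a supported backend.
        if forced is not None:
            if forced not in _SUPPORTED_ENC_DEC_BACKENDS:
                raise NotImplementedError(
                    "XFormers and Flash-Attention are the only backends "
                    "currently supported with encoder/decoder models.")
            return forced
        # No override: only the XFormers-only architectures get pinned.
        if arch in _XFORMERS_ONLY_ENCODER_DECODER_ARCHS:
            return "XFORMERS"
        return None   # fall through to normal backend auto-selection
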
- if maybe_global_forced_backend != _Backend.XFORMERS: + if maybe_global_forced_backend not in\ + [_Backend.XFORMERS, _Backend.FLASH_ATTN]: raise_backend_err() elif is_forced_by_env_var: # Backend override enforced by vLLM backend # environment variable - if maybe_env_var_forced_backend != _Backend.XFORMERS: + if maybe_env_var_forced_backend not in\ + [_Backend.XFORMERS, _Backend.FLASH_ATTN]: raise_backend_err() def _list_to_int32_tensor( @@ -532,6 +545,7 @@ def _prepare_encoder_model_input_tensors( attn_metadata.encoder_seq_lens, attn_metadata.encoder_seq_lens_tensor, attn_metadata.max_encoder_seq_len, + attn_metadata.encoder_seq_start_loc, attn_metadata.cross_slot_mapping, attn_metadata.cross_block_tables, ) = ( @@ -539,6 +553,7 @@ def _prepare_encoder_model_input_tensors( encoder_seq_lens, encoder_seq_lens_tensor, max_encoder_seq_len, + encoder_seq_start_loc, cross_slot_mapping_tensor, cross_block_tables, ) From af7380d83b0d67726a4a6c7a86766423bed6a7a8 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 1 Nov 2024 23:35:47 -0700 Subject: [PATCH 21/43] [torch.compile] fix cpu broken code (#9947) Signed-off-by: youkaichao --- vllm/utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 1041120a24b3f..a742ec8d76908 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1551,7 +1551,14 @@ def direct_register_custom_op( """ if is_in_doc_build(): return - schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args) + import torch.library + if hasattr(torch.library, "infer_schema"): + schema_str = torch.library.infer_schema(op_func, + mutates_args=mutates_args) + else: + # for pytorch 2.4 + import torch._custom_op.impl + schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args) my_lib = target_lib or vllm_lib my_lib.define(op_name + schema_str) my_lib.impl(op_name, op_func, "CUDA") From eed92f12fc829ff074e7341283cb1677b7e65aa2 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 2 Nov 2024 09:02:18 +0000 Subject: [PATCH 22/43] [Docs] Update Granite 3.0 models in supported models table (#9930) Signed-off-by: Nick Hill Signed-off-by: Nick Hill Co-authored-by: Cyrus Leung --- docs/source/models/supported_models.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 80714a90df5c2..a5c085bb84db9 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -160,13 +160,13 @@ Text Generation - - ✅︎ * - :code:`GraniteForCausalLM` - - PowerLM - - :code:`ibm/PowerLM-3b` etc. + - Granite 3.0, PowerLM + - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.0-8b-instruct`, :code:`ibm/PowerLM-3b`, etc. - ✅︎ - ✅︎ * - :code:`GraniteMoeForCausalLM` - - PowerMoE - - :code:`ibm/PowerMoE-3b` etc. + - Granite 3.0 MoE, PowerMoE + - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc. 
- ✅︎ - ✅︎ * - :code:`InternLMForCausalLM` From 1d4cfe2be1907408d610489bdca7bc8f8d2345b1 Mon Sep 17 00:00:00 2001 From: Michael Green <59619482+mikegre-google@users.noreply.github.com> Date: Sat, 2 Nov 2024 14:06:45 +0000 Subject: [PATCH 23/43] [Doc] Updated tpu-installation.rst with more details (#9926) Signed-off-by: Michael Green --- .../getting_started/tpu-installation.rst | 158 ++++++++++++++++-- 1 file changed, 144 insertions(+), 14 deletions(-) diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index edba209986f6a..f0c812b941c1f 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -1,35 +1,167 @@ .. _installation_tpu: +##################### Installation with TPU -===================== +##################### -vLLM supports Google Cloud TPUs using PyTorch XLA. +Tensor Processing Units (TPUs) are Google's custom-developed application-specific +integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs +are available in different versions each with different hardware specifications. +For more information about TPUs, see `TPU System Architecture `_. +For more information on the TPU versions supported with vLLM, see: + +* `TPU v6e `_ +* `TPU v5e `_ +* `TPU v5p `_ +* `TPU v4 `_ + +These TPU versions allow you to configure the physical arrangements of the TPU +chips. This can improve throughput and networking performance. For more +information see: + +* `TPU v6e topologies `_ +* `TPU v5e topologies `_ +* `TPU v5p topologies `_ +* `TPU v4 topologies `_ + +In order for you to use Cloud TPUs you need to have TPU quota granted to your +Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a +GPC project and are specified in terms of TPU version, the number of TPU you +want to use, and quota type. For more information, see `TPU quota `_. + +For TPU pricing information, see `Cloud TPU pricing `_. + +You may need additional persistent storage for your TPU VMs. For more +information, see `Storage options for Cloud TPU data `_. Requirements ------------ -* Google Cloud TPU VM (single & multi host) -* TPU versions: v5e, v5p, v4 -* Python: 3.10 +* Google Cloud TPU VM +* TPU versions: v6e, v5e, v5p, v4 +* Python: 3.10 or newer + +Provision Cloud TPUs +==================== + +You can provision Cloud TPUs using the `Cloud TPU API `_` +or the `queued resources `_` +API. This section shows how to create TPUs using the queued resource API. +For more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. +`Queued resources `_ +enable you to request Cloud TPU resources in a queued manner. When you request +queued resources, the request is added to a queue maintained by the Cloud TPU +service. When the requested resource becomes available, it's assigned to your +Google Cloud project for your immediate exclusive use. + +Provision a Cloud TPU with the queued resource API +-------------------------------------------------- +Create a TPU v5e with 4 TPU chips: + +.. code-block:: console + + gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ + --node-id TPU_NAME \ + --project PROJECT_ID \ + --zone ZONE \ + --accelerator-type ACCELERATOR_TYPE \ + --runtime-version RUNTIME_VERSION \ + --service-account SERVICE_ACCOUNT + +.. 
list-table:: Parameter descriptions + :header-rows: 1 + + * - Parameter name + - Description + * - QUEUED_RESOURCE_ID + - The user-assigned ID of the queued resource request. + * - TPU_NAME + - The user-assigned name of the TPU which is created when the queued + resource request is allocated. + * - PROJECT_ID + - Your Google Cloud project + * - ZONE + - The `zone `_ where you + want to create your Cloud TPU. + * - ACCELERATOR_TYPE + - The TPU version you want to use. Specify the TPU version, followed by a + '-' and the number of TPU cores. For example `v5e-4` specifies a v5e TPU + with 4 cores. For more information, see `TPU versions `_. + * - RUNTIME_VERSION + - The TPU VM runtime version to use. For more information see `TPU VM images `_. + * - SERVICE_ACCOUNT + - The email address for your service account. You can find it in the IAM + Cloud Console under *Service Accounts*. For example: + `tpu-service-account@.iam.gserviceaccount.com` + +Connect to your TPU using SSH: + +.. code-block:: bash + + gcloud compute tpus tpu-vm ssh TPU_NAME + +Create and activate a Conda environment for vLLM: + +.. code-block:: bash -Installation options: + conda create -n vllm python=3.10 -y + conda activate vllm -1. :ref:`Build a docker image with Dockerfile `. -2. :ref:`Build from source `. +Clone the vLLM repository and go to the vLLM directory: + +.. code-block:: bash + + git clone https://github.com/vllm-project/vllm.git && cd vllm + +Uninstall the existing `torch` and `torch_xla` packages: + +.. code-block:: bash + + pip uninstall torch torch-xla -y + +Install `torch` and `torch_xla` + +.. code-block:: bash + + pip install --pre torch==2.6.0.dev20241028+cpu torchvision==0.20.0.dev20241028+cpu --index-url https://download.pytorch.org/whl/nightly/cpu + pip install 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' -f https://storage.googleapis.com/libtpu-releases/index.html + +Install JAX and Pallas: + +.. code-block:: bash + + pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + pip install jaxlib==0.4.32.dev20240829 jax==0.4.32.dev20240829 -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + +Install other build dependencies: + +.. code-block:: bash + + pip install -r requirements-tpu.txt + VLLM_TARGET_DEVICE="tpu" python setup.py develop + sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + +Provision Cloud TPUs with GKE +----------------------------- + +For more information about using TPUs with GKE, see +https://cloud.google.com/kubernetes-engine/docs/how-to/tpus +https://cloud.google.com/kubernetes-engine/docs/concepts/tpus +https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus .. _build_docker_tpu: Build a docker image with :code:`Dockerfile.tpu` ------------------------------------------------ -`Dockerfile.tpu `_ is provided to build a docker image with TPU support. +You can use `Dockerfile.tpu `_ +to build a Docker image with TPU support. .. code-block:: console $ docker build -f Dockerfile.tpu -t vllm-tpu . - -You can run the docker image with the following command: +Run the Docker image with the following command: .. code-block:: console @@ -75,14 +207,12 @@ Next, build vLLM from source. 
This will only take a few seconds: $ VLLM_TARGET_DEVICE="tpu" python setup.py develop - .. note:: Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. The compilation time may take 20~30 minutes in the first run. However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). - .. tip:: If you encounter the following error: @@ -93,7 +223,7 @@ Next, build vLLM from source. This will only take a few seconds: ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory - Please install OpenBLAS with the following command: + Install OpenBLAS with the following command: .. code-block:: console From e8937954434037ac787efa800f01d9d294185439 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 2 Nov 2024 07:35:05 -0700 Subject: [PATCH 24/43] [2/N] executor pass the complete config to worker/modelrunner (#9938) Signed-off-by: youkaichao Co-authored-by: Nick Hill --- tests/lora/test_long_context.py | 8 +-- tests/lora/test_worker.py | 12 +++-- tests/spec_decode/utils.py | 7 +-- .../test_encoder_decoder_model_runner.py | 9 +--- tests/worker/test_model_runner.py | 10 +--- tests/worker/test_profile.py | 7 +-- tests/worker/test_swap.py | 7 +-- vllm/config.py | 24 ++++----- vllm/engine/arg_utils.py | 13 ++--- vllm/engine/async_llm_engine.py | 8 +-- vllm/engine/llm_engine.py | 9 ++-- vllm/engine/multiprocessing/client.py | 4 +- vllm/executor/cpu_executor.py | 9 +--- vllm/executor/executor_base.py | 4 +- vllm/executor/gpu_executor.py | 11 +--- vllm/executor/neuron_executor.py | 6 +-- vllm/executor/openvino_executor.py | 8 +-- vllm/executor/tpu_executor.py | 7 +-- vllm/spec_decode/draft_model_runner.py | 36 ++----------- vllm/spec_decode/ngram_worker.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 35 ++++++------- vllm/spec_decode/target_model_runner.py | 34 ++++--------- vllm/v1/engine/llm_engine.py | 9 ++-- vllm/v1/executor/gpu_executor.py | 11 +--- vllm/v1/worker/gpu_model_runner.py | 41 +++++++-------- vllm/v1/worker/gpu_worker.py | 50 +++++++------------ vllm/worker/cpu_model_runner.py | 25 +++------- vllm/worker/cpu_worker.py | 37 ++++---------- vllm/worker/embedding_model_runner.py | 26 ++-------- vllm/worker/enc_dec_model_runner.py | 25 ++-------- vllm/worker/model_runner.py | 28 +++-------- vllm/worker/model_runner_base.py | 17 +++++++ vllm/worker/multi_step_model_runner.py | 1 + vllm/worker/multi_step_worker.py | 10 +--- vllm/worker/neuron_model_runner.py | 16 ++---- vllm/worker/neuron_worker.py | 20 +++----- vllm/worker/openvino_model_runner.py | 33 +++++------- vllm/worker/openvino_worker.py | 34 +++---------- vllm/worker/tpu_model_runner.py | 17 ++----- vllm/worker/tpu_worker.py | 28 +++-------- vllm/worker/worker.py | 45 ++++------------- vllm/worker/worker_base.py | 18 ++++++- vllm/worker/xpu_model_runner.py | 29 +++-------- vllm/worker/xpu_worker.py | 40 +++------------ 44 files changed, 250 insertions(+), 580 deletions(-) diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index c8edb02a88d4b..eada902c891f7 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -138,13 +138,7 @@ def test_rotary_emb_replaced(dist_init): enable_lora=True) engine_config = engine_args.create_engine_config() model_runner = ModelRunner( - model_config=engine_config.model_config, - 
parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, - lora_config=engine_config.lora_config, + vllm_config=engine_config, is_driver_worker=True, ) model_runner.load_model() diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 2f7ac85507425..9d814f657ac43 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -4,7 +4,8 @@ from unittest.mock import patch from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig) + ModelConfig, ParallelConfig, SchedulerConfig, + VllmConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker @@ -12,7 +13,7 @@ @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): - worker = Worker( + vllm_config = VllmConfig( model_config=ModelConfig( "meta-llama/Llama-2-7b-hf", task="auto", @@ -34,10 +35,13 @@ def test_worker_apply_lora(sql_lora_files): gpu_memory_utilization=1., swap_space=0, cache_dtype="auto"), - local_rank=0, - rank=0, lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, max_loras=32), + ) + worker = Worker( + vllm_config=vllm_config, + local_rank=0, + rank=0, distributed_init_method=f"file://{tempfile.mkstemp()[1]}", ) worker.init_device() diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index f683942a5854b..6cf0cfb09b8fa 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -81,12 +81,7 @@ def create_worker(cls: Callable[..., T], get_ip(), get_open_port()) worker = cls( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, + vllm_config=engine_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index e75884a7395e2..9e166ae64dbfb 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -19,14 +19,7 @@ def _create_model_runner(model: str, *args, engine_args = EngineArgs(model, *args, **kwargs) engine_config = engine_args.create_engine_config() model_runner = EncoderDecoderModelRunner( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, - lora_config=engine_config.lora_config, - prompt_adapter_config=engine_config.prompt_adapter_config, + vllm_config=engine_config, is_driver_worker=True, ) return model_runner diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index fe97199bac62d..433a9b30ba57a 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -16,15 +16,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: engine_args = EngineArgs(model, *args, **kwargs) engine_config = engine_args.create_engine_config() model_runner = ModelRunner( - model_config=engine_config.model_config, - 
parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, - lora_config=engine_config.lora_config, - prompt_adapter_config=engine_config.prompt_adapter_config, - observability_config=engine_config.observability_config, + vllm_config=engine_config, is_driver_worker=True, ) return model_runner diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index acd2ed6836365..194ea2aa506f4 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -24,12 +24,7 @@ def test_gpu_memory_profiling(): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) worker = Worker( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, + vllm_config=engine_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 7aa439ba0a154..acede959f59f8 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -19,12 +19,7 @@ def test_swap() -> None: distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) worker = Worker( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, + vllm_config=engine_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, diff --git a/vllm/config.py b/vllm/config.py index c2a8c956b374a..17e9b1c100498 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,6 +1,6 @@ import enum import json -from dataclasses import dataclass, field, fields +from dataclasses import dataclass, field from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal, Mapping, Optional, Set, Tuple, Type, Union) @@ -1941,9 +1941,9 @@ def __post_init__(self): f"installed. Original error:\n{otel_import_error_traceback}") -@dataclass(frozen=True) -class EngineConfig: - """Dataclass which contains all engine-related configuration. This +@dataclass +class VllmConfig: + """Dataclass which contains all vllm-related configuration. This simplifies passing around the distinct configurations in the codebase. """ @@ -1953,11 +1953,11 @@ class EngineConfig: scheduler_config: SchedulerConfig device_config: DeviceConfig load_config: LoadConfig - lora_config: Optional[LoRAConfig] - speculative_config: Optional[SpeculativeConfig] - decoding_config: Optional[DecodingConfig] - observability_config: Optional[ObservabilityConfig] - prompt_adapter_config: Optional[PromptAdapterConfig] + lora_config: Optional[LoRAConfig] = None + speculative_config: Optional[SpeculativeConfig] = None + decoding_config: Optional[DecodingConfig] = None + observability_config: Optional[ObservabilityConfig] = None + prompt_adapter_config: Optional[PromptAdapterConfig] = None def __post_init__(self): """Verify configs are valid & consistent with each other. 
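The hunks above rename EngineConfig to VllmConfig and give the optional sub-configs defaults; the rest of this series threads that single object through every executor, worker, and model runner. As a quick orientation sketch (not part of the diff itself), the call-site shape that the updated tests in this patch converge on looks roughly like the following; the model name is only illustrative:

    from vllm.engine.arg_utils import EngineArgs
    from vllm.worker.model_runner import ModelRunner

    # Build the single bundled config from the usual engine arguments
    # (the model name here is only an example).
    engine_args = EngineArgs(model="facebook/opt-125m", enforce_eager=True)
    vllm_config = engine_args.create_engine_config()  # returns a VllmConfig

    # Pass one object through instead of eight separate *Config arguments.
    model_runner = ModelRunner(vllm_config=vllm_config, is_driver_worker=True)
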
@@ -1975,9 +1975,3 @@ def __post_init__(self): if self.prompt_adapter_config: self.prompt_adapter_config.verify_with_model_config( self.model_config) - - def to_dict(self): - """Return the configs as a dictionary, for use in **kwargs. - """ - return dict( - (field.name, getattr(self, field.name)) for field in fields(self)) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b1f0f8b9df925..da06ab186821e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -9,10 +9,11 @@ import vllm.envs as envs from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig, - DeviceConfig, EngineConfig, LoadConfig, LoadFormat, - LoRAConfig, ModelConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, TaskOption, TokenizerPoolConfig) + DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, + SpeculativeConfig, TaskOption, TokenizerPoolConfig, + VllmConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -955,7 +956,7 @@ def create_load_config(self) -> LoadConfig: ignore_patterns=self.ignore_patterns, ) - def create_engine_config(self) -> EngineConfig: + def create_engine_config(self) -> VllmConfig: # gguf file needs a specific model loader and doesn't use hf_repo if check_gguf_file(self.model): self.quantization = self.load_format = "gguf" @@ -1167,7 +1168,7 @@ def create_engine_config(self) -> EngineConfig: or "all" in detailed_trace_modules, ) - return EngineConfig( + return VllmConfig( model_config=model_config, cache_config=cache_config, parallel_config=parallel_config, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 6aeaf484a22b4..b0fdc67776bbd 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -7,8 +7,8 @@ from weakref import ReferenceType import vllm.envs as envs -from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, VllmConfig) from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout @@ -604,7 +604,7 @@ def __del__(self): @classmethod def _get_executor_cls( - cls, engine_config: EngineConfig) -> Type[ExecutorAsyncBase]: + cls, engine_config: VllmConfig) -> Type[ExecutorAsyncBase]: distributed_executor_backend = ( engine_config.parallel_config.distributed_executor_backend) if isinstance(distributed_executor_backend, type): @@ -663,7 +663,7 @@ def _get_executor_cls( def from_engine_args( cls, engine_args: AsyncEngineArgs, - engine_config: Optional[EngineConfig] = None, + engine_config: Optional[VllmConfig] = None, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e6fe1effb8287..b12d29c4a8503 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -13,8 +13,9 @@ from typing_extensions import TypeIs, TypeVar import vllm.envs as envs -from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, SchedulerConfig) +from vllm.config import 
(DecodingConfig, LoRAConfig, ModelConfig, + ObservabilityConfig, ParallelConfig, SchedulerConfig, + VllmConfig) from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler, SchedulerOutputs) from vllm.engine.arg_utils import EngineArgs @@ -219,7 +220,7 @@ def validate_outputs( def __init__( self, - vllm_config: EngineConfig, + vllm_config: VllmConfig, executor_class: Type[ExecutorBase], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, @@ -500,7 +501,7 @@ def _initialize_kv_caches(self) -> None: @classmethod def _get_executor_cls(cls, - engine_config: EngineConfig) -> Type[ExecutorBase]: + engine_config: VllmConfig) -> Type[ExecutorBase]: distributed_executor_backend = ( engine_config.parallel_config.distributed_executor_backend) # Initialize the cluster and specify the executor class. diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 6e6630b3ff55f..7f1ca621d91c4 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -13,7 +13,7 @@ from zmq.asyncio import Socket from vllm import PoolingParams -from vllm.config import DecodingConfig, EngineConfig, ModelConfig +from vllm.config import DecodingConfig, ModelConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs # yapf conflicts with isort for this block @@ -78,7 +78,7 @@ class MQLLMEngineClient(EngineClient): every N seconds, confirming the engine is healthy """ - def __init__(self, ipc_path: str, engine_config: EngineConfig, + def __init__(self, ipc_path: str, engine_config: VllmConfig, engine_pid: int): self.context = zmq.asyncio.Context() self._errored_with: Optional[BaseException] = None diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index e32993e0e452e..ab3ebb4e43d18 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -138,18 +138,11 @@ def _create_worker( assert self.distributed_init_method is not None kwargs = dict( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=local_rank, rank=rank, distributed_init_method=self.distributed_init_method, - lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, - prompt_adapter_config=self.prompt_adapter_config, is_driver_worker=rank == 0, ) wrapper.init_worker(**kwargs) diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 2248eecd1849f..9cba189dd57f9 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from typing import List, Optional, Set, Tuple -from vllm.config import EngineConfig +from vllm.config import VllmConfig from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest @@ -20,7 +20,7 @@ class ExecutorBase(ABC): def __init__( self, - vllm_config: EngineConfig, + vllm_config: VllmConfig, ) -> None: self.vllm_config = vllm_config self.model_config = vllm_config.model_config diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index ed30d3186a453..c65d0836e5ff7 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -49,21 +49,12 @@ def _get_worker_kwargs( 
distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) return dict( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - speculative_config=self.speculative_config, - prompt_adapter_config=self.prompt_adapter_config, is_driver_worker=(not self.parallel_config) or (rank % self.parallel_config.tensor_parallel_size == 0), - observability_config=self.observability_config, ) def _get_worker_module_and_class( diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index f2fcfa58b26e1..02d37cd7fbf23 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -29,11 +29,7 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) self.driver_worker = NeuronWorker( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, + vllm_config=self.vllm_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method) diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py index d0c0333854dae..d06b0ccb7906e 100644 --- a/vllm/executor/openvino_executor.py +++ b/vllm/executor/openvino_executor.py @@ -48,16 +48,10 @@ def _init_worker(self): get_ip(), get_open_port()) self.driver_worker = OpenVINOWorker( ov_core=self.ov_core, - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, - lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py index 972649dedf33e..e37e8973790db 100644 --- a/vllm/executor/tpu_executor.py +++ b/vllm/executor/tpu_executor.py @@ -44,12 +44,7 @@ def _get_worker_kwargs( distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) return dict( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 3aa999fcb9ebb..17cc0ad1a4a3a 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -17,9 +17,6 @@ "Draft model speculative decoding currently only supports" "CUDA and ROCm flash attention backend.") from err -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) from vllm.logger import init_logger from vllm.multimodal import MultiModalInputs from vllm.sequence import ExecuteModelRequest, IntermediateTensors @@ -49,40 +46,13 @@ class TP1DraftModelRunner(ModelRunner): any 
broadcasting inside execute_model). """ - def __init__( - self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - return_hidden_states: bool = False, - observability_config: Optional[ObservabilityConfig] = None, - ): - if return_hidden_states: + def __init__(self, *args, **kwargs): + if kwargs.get("return_hidden_states"): raise ValueError( "return_hidden_states is not supported for TP1DraftModelRunner." ) - super().__init__( - model_config=model_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - cache_config=cache_config, - load_config=load_config, - lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - prompt_adapter_config=prompt_adapter_config, - return_hidden_states=return_hidden_states, - observability_config=observability_config, - ) + super().__init__(*args, **kwargs) def _update_sampling_metadata(self, sampling_metadata, num_seqs, num_queries): diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index a777e5c3f22a7..debb3b2d5ec30 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -21,7 +21,7 @@ class NGramWorker(NonLLMProposerWorkerBase): def __init__(self, *args, **kwargs): # Get local_rank/vocab_size from kwargs attribute self.local_rank = kwargs["local_rank"] - self.vocab_size = kwargs["model_config"].get_vocab_size() + self.vocab_size = kwargs["vllm_config"].model_config.get_vocab_size() # Lazy initialization list. self._proposer: Top1Proposer diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 9f7ef2f8d851c..a402181b13db8 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -1,10 +1,11 @@ +import copy from collections import defaultdict from functools import cached_property from typing import Any, Dict, List, Optional, Set, Tuple, Type import torch -from vllm.config import ParallelConfig, SpeculativeConfig +from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler @@ -45,8 +46,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": """Helper method that is the entrypoint for Executors which use WorkerWrapper. It constructs a SpecDecodeWorker from the speculative config. """ - assert "speculative_config" in kwargs - speculative_config: SpeculativeConfig = kwargs.get("speculative_config") + vllm_config: VllmConfig = kwargs.get("vllm_config") + speculative_config: SpeculativeConfig = vllm_config.speculative_config assert speculative_config is not None draft_worker_kwargs = kwargs.copy() @@ -58,14 +59,16 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": target_worker.model_runner.disable_logprobs =\ speculative_config.disable_logprobs + draft_worker_config = copy.deepcopy(vllm_config) + draft_worker_config.model_config = speculative_config.draft_model_config + draft_worker_config.parallel_config = speculative_config.draft_parallel_config # noqa + # TODO allow draft-model specific load config. 
+ # Override draft-model specific worker args. draft_worker_kwargs.update( - model_config=speculative_config.draft_model_config, - parallel_config=speculative_config.draft_parallel_config, + vllm_config=draft_worker_config, ngram_prompt_lookup_max=speculative_config.ngram_prompt_lookup_max, ngram_prompt_lookup_min=speculative_config.ngram_prompt_lookup_min, - # TODO allow draft-model specific load config. - #load_config=load_config, ) spec_decode_worker = SpecDecodeWorker.create_worker( @@ -134,29 +137,27 @@ def create_worker( draft_worker_kwargs.pop("ngram_prompt_lookup_max")) ngram_prompt_lookup_min = ( draft_worker_kwargs.pop("ngram_prompt_lookup_min")) + draft_model_config = draft_worker_kwargs["vllm_config"].model_config + draft_parallel_config: ParallelConfig = draft_worker_kwargs[ + 'vllm_config'].parallel_config if ngram_prompt_lookup_max > 0: proposer_worker = NGramWorker(**draft_worker_kwargs) proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) else: - draft_parallel_config: ParallelConfig = draft_worker_kwargs[ - 'parallel_config'] draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size - if draft_worker_kwargs[ - "model_config"].hf_config.model_type == "mlp_speculator": + if draft_model_config.hf_config.model_type == "mlp_speculator": proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) - elif draft_worker_kwargs[ - "model_config"].hf_config.model_type == "medusa": + elif draft_model_config.hf_config.model_type == "medusa": proposer_worker = MedusaWorker(**draft_worker_kwargs) else: if draft_tp == 1: draft_worker_kwargs[ "model_runner_cls"] = TP1DraftModelRunner else: - if draft_worker_kwargs[ - "model_config"].hf_config.model_type == "eagle": + if draft_model_config.hf_config.model_type == "eagle": raise NotImplementedError( "EAGLE does not support TP > 1 yet") @@ -190,8 +191,8 @@ def create_worker( "[Speculative Decoding] Disabling MQA scorer as the " "MQA is only available with flash attn backend.") - if "model_config" in draft_worker_kwargs and \ - draft_worker_kwargs["model_config"].max_model_len < \ + if draft_model_config and \ + draft_model_config.max_model_len < \ scorer_worker.model_config.max_model_len: disable_mqa_scorer = True logger.info( diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index 2bb7af7d7c600..e61cde5b17f20 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,8 +1,6 @@ from typing import List, Optional -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.sequence import SequenceGroupMetadata from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, ModelRunner) @@ -20,35 +18,21 @@ class TargetModelRunner(ModelRunner): requested or not. 
""" - def __init__(self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - return_hidden_states: bool = False, - observability_config: Optional[ObservabilityConfig] = None): + def __init__( + self, + vllm_config: VllmConfig, + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + return_hidden_states: bool = False, + ): # An internal boolean member variable to indicate if token log # probabilities are needed or not. self.disable_logprobs = True super().__init__( - model_config=model_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - cache_config=cache_config, - load_config=load_config, - lora_config=lora_config, + vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker, - prompt_adapter_config=prompt_adapter_config, return_hidden_states=return_hidden_states, - observability_config=observability_config, ) def prepare_model_input( diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index febabd2f31036..64cc18149d6c5 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -2,8 +2,9 @@ from typing import (Any, Dict, Iterable, List, Mapping, Optional, Tuple, Type, Union) -from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, SchedulerConfig) +from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, + ObservabilityConfig, ParallelConfig, SchedulerConfig, + VllmConfig) from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, @@ -32,7 +33,7 @@ class LLMEngine: def __init__( self, - vllm_config: EngineConfig, + vllm_config: VllmConfig, executor_class: Type[GPUExecutor], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, @@ -477,7 +478,7 @@ def get_lora_config(self) -> LoRAConfig: return self.lora_config @classmethod - def _get_executor_cls(cls, engine_config: EngineConfig): + def _get_executor_cls(cls, engine_config: VllmConfig): return GPUExecutor def is_tracing_enabled(self) -> bool: diff --git a/vllm/v1/executor/gpu_executor.py b/vllm/v1/executor/gpu_executor.py index c780c7031c3d6..b12c500f1f9ee 100644 --- a/vllm/v1/executor/gpu_executor.py +++ b/vllm/v1/executor/gpu_executor.py @@ -56,19 +56,10 @@ def _create_worker( distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) return Worker( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - speculative_config=self.speculative_config, - prompt_adapter_config=self.prompt_adapter_config, - observability_config=self.observability_config, ) def determine_num_available_blocks(self) -> Tuple[int, int]: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e84645ac7a4ae..77c1e10ab6bdf 100644 --- 
a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -7,9 +7,7 @@ import torch.distributed import torch.nn as nn -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model @@ -33,26 +31,25 @@ class GPUModelRunner: def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig] = None, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - observability_config: Optional[ObservabilityConfig] = None, + vllm_config: VllmConfig, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config - + # TODO: use ModelRunnerBase.__init__(self, vllm_config=vllm_config) + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + + model_config = self.model_config + cache_config = self.cache_config + scheduler_config = self.scheduler_config + parallel_config = self.parallel_config self.device = self.device_config.device self.pin_memory = is_pin_memory_available() self.dtype = self.model_config.dtype diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 8c5ca2ec35666..c8192b7f86eb0 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -6,10 +6,7 @@ import torch import torch.distributed -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) @@ -30,48 +27,35 @@ class Worker: def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - speculative_config: Optional[SpeculativeConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - observability_config: Optional[ObservabilityConfig] = None, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - 
self.cache_config = cache_config - self.load_config = load_config + + # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config) + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.speculative_config = speculative_config - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner = GPUModelRunner( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config, - lora_config=lora_config, - ) + self.model_runner = GPUModelRunner(vllm_config) def initialize(self): if self.device_config.device.type == "cuda": diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 0c6fcdf03ba9e..a98faa2f2d0cb 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -8,9 +8,7 @@ from torch import nn from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, PromptAdapterConfig, - SchedulerConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding @@ -412,29 +410,18 @@ class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", - prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, *args, **kwargs, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config + ModelRunnerBase.__init__(self, vllm_config) # Currently, CPU worker doesn't support chunked prefill. 
assert self.scheduler_config.chunked_prefill_enabled is False - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.prompt_adapter_config = prompt_adapter_config - self.load_config = load_config + model_config = self.model_config + cache_config = self.cache_config + self.is_driver_worker = is_driver_worker self.device = self.device_config.device diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index ab93471b5af74..3778707ae07e8 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -6,9 +6,8 @@ import vllm.envs as envs from vllm.attention import get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, PromptAdapterConfig, - SchedulerConfig) +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, VllmConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger @@ -18,7 +17,8 @@ from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner from vllm.worker.cpu_model_runner import CPUModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoraNotSupportedWorkerBase, WorkerInput) + LoraNotSupportedWorkerBase, WorkerBase, + WorkerInput) logger = init_logger(__name__) @@ -121,31 +121,19 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, kv_cache_dtype: Optional[str] = "auto", - prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, ) -> None: - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config + WorkerBase.__init__(self, vllm_config=vllm_config) + self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.prompt_adapter_config = prompt_adapter_config + self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." @@ -166,15 +154,8 @@ def __init__( if self._is_encoder_decoder_model(): ModelRunnerClass = CPUEncoderDecoderModelRunner self.model_runner: CPUModelRunner = ModelRunnerClass( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config=self.load_config, - lora_config=self.lora_config, + vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, - prompt_adapter_config=self.prompt_adapter_config, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
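The remaining files in this patch repeat the same constructor shape: the base class unpacks the bundled config once, and each subclass forwards vllm_config plus its few runner-specific arguments. A condensed sketch of that pattern, using simplified stand-in classes rather than the real implementations:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class VllmConfig:  # simplified stand-in for vllm.config.VllmConfig
        model_config: object
        cache_config: object
        parallel_config: object
        lora_config: Optional[object] = None

    class ModelRunnerBase:
        def __init__(self, vllm_config: VllmConfig) -> None:
            # Unpack the bundle once, in one place.
            self.vllm_config = vllm_config
            self.model_config = vllm_config.model_config
            self.cache_config = vllm_config.cache_config
            self.parallel_config = vllm_config.parallel_config
            self.lora_config = vllm_config.lora_config

    class ExampleModelRunner(ModelRunnerBase):  # hypothetical subclass
        def __init__(self, vllm_config: VllmConfig,
                     kv_cache_dtype: Optional[str] = "auto",
                     is_driver_worker: bool = False) -> None:
            # Subclasses forward the single object plus their own knobs,
            # instead of re-threading every *Config argument.
            super().__init__(vllm_config)
            self.kv_cache_dtype = kv_cache_dtype
            self.is_driver_worker = is_driver_worker
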
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index a7f5b2d4fdd1f..ff288d5ca1512 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -3,9 +3,7 @@ import torch -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.distributed import get_pp_group from vllm.forward_context import set_forward_context from vllm.logger import init_logger @@ -36,29 +34,13 @@ class EmbeddingModelRunner( def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - observability_config: Optional[ObservabilityConfig] = None, ): - super().__init__(model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config, - lora_config=lora_config, + super().__init__(vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - prompt_adapter_config=prompt_adapter_config, - observability_config=observability_config) + is_driver_worker=is_driver_worker) @torch.inference_mode() def execute_model( diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 2ea314f8608ee..90a43196084ea 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -11,9 +11,7 @@ from vllm.attention.selector import (_Backend, get_env_variable_attn_backend, get_global_forced_attn_backend, global_force_attn_backend) -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import ModelConfig, VllmConfig from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger @@ -85,17 +83,9 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - observability_config: Optional[ObservabilityConfig] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): @@ -107,15 +97,10 @@ def __init__( models) but these arguments are present here for compatibility with the base-class constructor. 
''' - self._maybe_force_supported_attention_backend(model_config) + self._maybe_force_supported_attention_backend(vllm_config.model_config) + super().__init__( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config, - lora_config=None, + vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker, ) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index f2123c64c3274..0e200e6abb05e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -20,9 +20,7 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.compilation.compile_context import set_compile_context from vllm.compilation.levels import CompilationLevel -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.distributed import get_pp_group from vllm.distributed.parallel_state import graph_capture @@ -955,32 +953,20 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, return_hidden_states: bool = False, - observability_config: Optional[ObservabilityConfig] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config + + ModelRunnerBase.__init__(self, vllm_config) + model_config = self.model_config + cache_config = self.cache_config + self.is_driver_worker = is_driver_worker - self.prompt_adapter_config = prompt_adapter_config self.return_hidden_states = return_hidden_states - self.observability_config = observability_config self.device = self.device_config.device self.pin_memory = is_pin_memory_available() diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 89d7addb5a8d9..9e529f86b46bb 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -9,6 +9,7 @@ import torch from torch import is_tensor +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.platforms import current_platform @@ -220,6 +221,22 @@ class ModelRunnerBase(ABC, Generic[T]): ModelRunnerInputBase subclass. 
""" + def __init__( + self, + vllm_config: VllmConfig, + ) -> None: + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + # Map of request_id -> generator used for seeded random sampling generators: Dict[str, torch.Generator] = {} diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index be2f0d79154d6..3ee0fb4dc943e 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -304,6 +304,7 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]): # mypy: enable-error-code=type-var def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): + super().__init__(*args, **kwargs) # Check attention backend support. diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index bf66f32d7d244..1f982fe103366 100644 --- a/vllm/worker/multi_step_worker.py +++ b/vllm/worker/multi_step_worker.py @@ -27,17 +27,9 @@ def __init__(self, *args, **kwargs): # for multi-step model, wrap the model runner with MultiStepModelRunner self.model_runner = MultiStepModelRunner( base_model_runner, - base_model_runner.model_config, - base_model_runner.parallel_config, - base_model_runner.scheduler_config, - base_model_runner.device_config, - base_model_runner.cache_config, - load_config=base_model_runner.load_config, - lora_config=self.lora_config, + vllm_config=base_model_runner.vllm_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=base_model_runner.is_driver_worker, - prompt_adapter_config=base_model_runner.prompt_adapter_config, - observability_config=base_model_runner.observability_config, ) pipeline_parallel_size = self.parallel_config.pipeline_parallel_size diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index b8c760c4b5396..2da22cbfc7cb5 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -7,8 +7,7 @@ from torch import nn from transformers_neuronx.config import GenerationConfig -from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig, - SchedulerConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput @@ -57,20 +56,13 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, + vllm_config: VllmConfig, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - + ModelRunnerBase.__init__(self, vllm_config) + model_config = self.model_config if model_config is not None and model_config.get_sliding_window(): logger.warning("Sliding window is not supported on Neuron. 
" "The model will run without sliding window.") - self.device_config = (device_config - if device_config is not None else DeviceConfig()) self.device = self.device_config.device self.pin_memory = is_pin_memory_available() diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index fff14d6402b44..3f6269684ac93 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -4,15 +4,15 @@ import torch import torch.distributed -from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest from vllm.worker.neuron_model_runner import NeuronModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoraNotSupportedWorkerBase, WorkerInput) + LoraNotSupportedWorkerBase, WorkerBase, + WorkerInput) class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): @@ -21,20 +21,12 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, ) -> None: - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config + WorkerBase.__init__(self, vllm_config=vllm_config) self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method @@ -44,7 +36,7 @@ def __init__( init_cached_hf_modules() self.model_runner: NeuronModelRunner = NeuronModelRunner( - model_config, parallel_config, scheduler_config, device_config) + vllm_config=vllm_config) self.is_driver_worker = True def init_device(self) -> None: diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 3da738636a59d..c9c87ea748081 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -7,9 +7,7 @@ from vllm.attention import get_attn_backend from vllm.attention.backends.openvino import OpenVINOAttentionMetadata -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - SchedulerConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput @@ -17,6 +15,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalInputs, MultiModalPlaceholderMap) from vllm.sequence import SequenceGroupMetadata +from vllm.worker.model_runner_base import ModelRunnerBase logger = init_logger(__name__) @@ -39,33 +38,21 @@ def empty(cls, device): multi_modal_kwargs={}) -class OpenVINOModelRunner: +class OpenVINOModelRunner(ModelRunnerBase): def __init__( self, ov_core: ov.Core, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", 
is_driver_worker: bool = False, *args, **kwargs, ): self.ov_core = ov_core - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.multimodal_config = multimodal_config - self.load_config = load_config + ModelRunnerBase.__init__(self, vllm_config=vllm_config) + cache_config = self.cache_config + model_config = self.model_config self.is_driver_worker = is_driver_worker self.device = self.device_config.device @@ -369,3 +356,9 @@ def execute_model( sampling_metadata=sampling_metadata, ) return output + + def prepare_model_input(self, *args, **kwargs): + raise NotImplementedError + + def make_model_input_from_broadcasted_tensor_dict(self, *args, **kwargs): + raise NotImplementedError diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index a420d390c1ae4..205f8a337ce6c 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -7,9 +7,8 @@ import vllm.envs as envs from vllm.attention import get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - SchedulerConfig) +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, VllmConfig) from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized, init_distributed_environment) @@ -22,7 +21,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata from vllm.worker.openvino_model_runner import OpenVINOModelRunner -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase logger = init_logger(__name__) @@ -212,33 +211,19 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): def __init__( self, ov_core: ov.Core, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined, is_driver_worker: bool = False, ) -> None: self.ov_core = ov_core - self.model_config = model_config - self.parallel_config = parallel_config + WorkerBase.__init__(self, vllm_config) self.parallel_config.rank = rank - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.multimodal_config = multimodal_config self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." 
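The worker and model-runner hunks in this series all apply one pattern: the long list of per-subsystem config parameters is replaced by a single VllmConfig argument, and the base class unpacks it once into the familiar attributes. A minimal, self-contained sketch of that pattern follows, using simplified stand-in classes rather than vLLM's real config definitions:

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ParallelConfig:
    # Stand-in for vllm.config.ParallelConfig; only the fields used here.
    tensor_parallel_size: int = 1
    rank: int = 0


@dataclass
class VllmConfig:
    # Stand-in aggregate config; the real class carries many more members
    # (model_config, cache_config, load_config, scheduler_config, ...).
    parallel_config: ParallelConfig = field(default_factory=ParallelConfig)
    lora_config: Optional[object] = None


class WorkerBase:
    def __init__(self, vllm_config: VllmConfig) -> None:
        # Unpack the aggregate once so subclasses and helpers can keep
        # referring to self.parallel_config, self.lora_config, etc.
        self.vllm_config = vllm_config
        self.parallel_config = vllm_config.parallel_config
        self.lora_config = vllm_config.lora_config


class Worker(WorkerBase):
    def __init__(self, vllm_config: VllmConfig, rank: int) -> None:
        WorkerBase.__init__(self, vllm_config)
        # Per-worker state (rank, local_rank, distributed_init_method in the
        # real classes) still arrives as explicit arguments.
        self.parallel_config.rank = rank


worker = Worker(VllmConfig(), rank=0)
assert worker.parallel_config.rank == 0

Downstream components are then constructed with vllm_config=... only, which is exactly the simplification the call-site hunks in this series perform.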
@@ -250,14 +235,7 @@ def __init__( init_cached_hf_modules() self.model_runner = OpenVINOModelRunner( self.ov_core, - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config=self.load_config, - lora_config=self.lora_config, - multimodal_config=self.multimodal_config, + vllm_config=self.vllm_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker, ) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 3792cbc0f730f..7d9d669a45ce3 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -12,8 +12,7 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model @@ -90,20 +89,10 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, is_driver_worker: bool = False, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config + ModelRunnerBase.__init__(self, vllm_config=vllm_config) self.is_driver_worker = is_driver_worker self.block_size = self.cache_config.block_size diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index de6f7ab0072fd..096cb23416909 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -6,8 +6,7 @@ import torch_xla.runtime as xr import vllm.envs as envs -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger @@ -16,7 +15,8 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size from vllm.worker.tpu_model_runner import TPUModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoraNotSupportedWorkerBase, WorkerInput) + LoraNotSupportedWorkerBase, WorkerBase, + WorkerInput) logger = init_logger(__name__) @@ -25,24 +25,14 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, is_driver_worker: bool, ) -> None: - self.model_config = model_config - self.parallel_config = parallel_config + WorkerBase.__init__(self, vllm_config=vllm_config) self.parallel_config.rank = rank - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method @@ -56,13 +46,7 @@ def __init__( self.cache_config.cache_dtype] 
self.model_runner: TPUModelRunner = TPUModelRunner( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config, - is_driver_worker=is_driver_worker) + vllm_config=vllm_config, is_driver_worker=is_driver_worker) def init_device(self) -> None: os.environ["PJRT_DEVICE"] = "TPU" diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index fd30962e5d6bb..8928936b4f9fc 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -7,10 +7,7 @@ import torch.distributed import vllm.envs as envs -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) @@ -27,7 +24,8 @@ from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner -from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, + WorkerInput) logger = init_logger(__name__) @@ -42,46 +40,31 @@ class Worker(LocalOrDistributedWorkerBase): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None, - observability_config: Optional[ObservabilityConfig] = None, ) -> None: - self.model_config = model_config - self.parallel_config = parallel_config + WorkerBase.__init__(self, vllm_config) self.parallel_config.rank = rank - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.load_config = load_config - self.prompt_adapter_config = prompt_adapter_config self.is_driver_worker = is_driver_worker - if parallel_config and is_driver_worker: - assert rank % parallel_config.tensor_parallel_size == 0, \ + if is_driver_worker: + assert rank % self.parallel_config.tensor_parallel_size == 0, \ "Driver worker should be rank 0 of tensor parallel group." 
if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.observability_config = observability_config # Return hidden states from target model if the draft model is an # mlp_speculator + speculative_config = self.speculative_config + model_config = self.model_config speculative_args = {} if speculative_config is None \ or (speculative_config.draft_model_config.model == model_config.model) \ @@ -97,17 +80,9 @@ def __init__( elif self._is_encoder_decoder_model(): ModelRunnerClass = EncoderDecoderModelRunner self.model_runner: GPUModelRunnerBase = ModelRunnerClass( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config=load_config, - lora_config=self.lora_config, + vllm_config=self.vllm_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, - prompt_adapter_config=prompt_adapter_config, - observability_config=observability_config, **speculative_args, ) # Uninitialized cache engine. Will be initialized by diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 6ba4f272315ce..cf8a4946a71c4 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -7,7 +7,7 @@ import torch -from vllm.config import ObservabilityConfig +from vllm.config import ObservabilityConfig, VllmConfig from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -29,6 +29,22 @@ class WorkerBase(ABC): communicate request metadata to other workers. """ + def __init__( + self, + vllm_config: VllmConfig, + ) -> None: + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + @abstractmethod def init_device(self) -> None: """Initialize device state, such as loading the model or other on-device diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 739fe1b3d2c4f..f37d70bee76ed 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -10,9 +10,7 @@ import torch.nn as nn from vllm.attention import get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.distributed import get_pp_group from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger @@ -363,33 +361,18 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, 
return_hidden_states: bool = False, - observability_config: Optional[ObservabilityConfig] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config + + ModelRunnerBase.__init__(self, vllm_config=vllm_config) + model_config = self.model_config + cache_config = self.cache_config self.is_driver_worker = is_driver_worker - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config - if self.observability_config is not None: - print(f"observability_config is {self.observability_config}") self.return_hidden_states = return_hidden_states self.device = self.device_config.device diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index c1d836bb0d318..1295666055b04 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -8,10 +8,7 @@ import torch import torch.distributed -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger @@ -19,7 +16,7 @@ from vllm.platforms import current_platform from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase from vllm.worker.xpu_model_runner import XPUModelRunner logger = init_logger(__name__) @@ -36,53 +33,32 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, - observability_config: Optional[ObservabilityConfig] = None, ) -> None: + WorkerBase.__init__(self, vllm_config=vllm_config) + device_config = self.device_config + parallel_config = self.parallel_config assert device_config.device_type == "xpu" assert current_platform.is_xpu() - self.model_config = model_config - self.parallel_config = parallel_config self.parallel_config.rank = rank - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config + self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.prompt_adapter_config = prompt_adapter_config self.is_driver_worker = is_driver_worker - self.observability_config = observability_config if parallel_config and is_driver_worker: assert rank % parallel_config.tensor_parallel_size == 0, \ "Driver worker should be rank 0 of tensor parallel group." 
self.model_runner = XPUModelRunner( # type: ignore - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config=self.load_config, - lora_config=self.lora_config, + vllm_config=vllm_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, - observability_config=self.observability_config, ) # Uninitialized cache engine. Will be initialized by # initialize_cache. From d6459b4516dbac4f346ce29fe90d43ebfafa1114 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sat, 2 Nov 2024 10:44:38 -0400 Subject: [PATCH 25/43] [V1] Fix `EngineArgs` refactor on V1 (#9954) --- vllm/v1/executor/gpu_executor.py | 39 ++++++++++---------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/vllm/v1/executor/gpu_executor.py b/vllm/v1/executor/gpu_executor.py index b12c500f1f9ee..de56332240192 100644 --- a/vllm/v1/executor/gpu_executor.py +++ b/vllm/v1/executor/gpu_executor.py @@ -1,10 +1,7 @@ import os from typing import Optional, Tuple -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import EngineConfig from vllm.logger import init_logger from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.v1.outputs import ModelRunnerOutput @@ -15,29 +12,17 @@ class GPUExecutor: - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - speculative_config: Optional[SpeculativeConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], - observability_config: Optional[ObservabilityConfig], - ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.speculative_config = speculative_config - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config + def __init__(self, vllm_config: EngineConfig) -> None: + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config self.worker = self._create_worker() self.worker.initialize() From 74b529ceeead8d4b44ded858f7c28bca9c1629ba Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 2 Nov 2024 08:03:33 -0700 Subject: [PATCH 26/43] [bugfix] fix chatglm dummy_data_for_glmv (#9955) Signed-off-by: youkaichao --- vllm/model_executor/models/chatglm.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index ca90d10e9f9fb..c3c9ec703c1e6 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -14,8 +14,8 @@ 
from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -31,8 +31,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, - MultiModalInputs) +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs from vllm.multimodal.base import MultiModalData from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -117,16 +116,15 @@ def get_max_glmv_image_tokens(ctx: InputContext): raise NotImplementedError(msg) -def dummy_data_for_glmv( - ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int] -) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: +def dummy_data_for_glmv(ctx: InputContext, seq_len: int, + mm_counts: Mapping[str, int]) -> DummyData: hf_config = ctx.get_hf_config(ChatGLMConfig) vision_config = getattr(hf_config, 'vision_config', None) if vision_config is None: token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len) seq_data = SequenceData(token_ids) - return seq_data, None + return DummyData(seq_data, None) elif isinstance(vision_config, dict): image_size = vision_config["image_size"] image_placeholder_length = calculate_image_placeholder(vision_config) @@ -141,7 +139,7 @@ def dummy_data_for_glmv( "image": Image.new("RGB", (image_size, image_size), color=0) } - return seq_data, mm_data + return DummyData(seq_data, mm_data) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) From cea808f32549973cc19204355c950ad005eeed87 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 2 Nov 2024 12:08:49 -0700 Subject: [PATCH 27/43] [3/N] model runner pass the whole config to model (#9958) Signed-off-by: youkaichao --- tests/lora/conftest.py | 9 +- vllm/model_executor/model_loader/__init__.py | 20 +-- vllm/model_executor/model_loader/loader.py | 132 ++++++++----------- vllm/plugins/__init__.py | 22 +++- vllm/v1/worker/gpu_model_runner.py | 8 +- vllm/worker/cpu_model_runner.py | 8 +- vllm/worker/model_runner.py | 8 +- vllm/worker/tpu_model_runner.py | 10 +- vllm/worker/xpu_model_runner.py | 10 +- 9 files changed, 87 insertions(+), 140 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index e40f0dd74602e..816d3986fe333 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -248,11 +248,10 @@ def llama_2_7b_engine_extra_embeddings(): cleanup_dist_env_and_memory(shutdown_ray=True) get_model_old = get_model - def get_model_patched(*, model_config, device_config, **kwargs): - kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8) - return get_model_old(model_config=model_config, - device_config=device_config, - **kwargs) + def get_model_patched(**kwargs): + kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4, + max_lora_rank=8) + return get_model_old(**kwargs) with 
patch("vllm.worker.model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index d1ec171c9ec2a..12468997e4653 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -1,27 +1,15 @@ -from typing import Optional - from torch import nn -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.model_executor.model_loader.loader import (BaseModelLoader, get_model_loader) from vllm.model_executor.model_loader.utils import ( get_architecture_class_name, get_model_architecture) -def get_model(*, model_config: ModelConfig, load_config: LoadConfig, - device_config: DeviceConfig, parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - lora_config: Optional[LoRAConfig], - cache_config: CacheConfig) -> nn.Module: - loader = get_model_loader(load_config) - return loader.load_model(model_config=model_config, - device_config=device_config, - lora_config=lora_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - cache_config=cache_config) +def get_model(*, vllm_config: VllmConfig) -> nn.Module: + loader = get_model_loader(vllm_config.load_config) + return loader.load_model(vllm_config=vllm_config) __all__ = [ diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 79703bb7ded7a..2cb9e0ca7c505 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -21,9 +21,9 @@ from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat, - LoRAConfig, ModelConfig, MultiModalConfig, - ParallelConfig, PoolerConfig, SchedulerConfig) +from vllm.config import (CacheConfig, LoadConfig, LoadFormat, LoRAConfig, + ModelConfig, MultiModalConfig, ParallelConfig, + PoolerConfig, SchedulerConfig, VllmConfig) from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.envs import VLLM_USE_MODELSCOPE @@ -150,6 +150,7 @@ def _get_model_initialization_kwargs( def build_model(model_class: Type[nn.Module], + vllm_config: VllmConfig, hf_config: PretrainedConfig, cache_config: Optional[CacheConfig], quant_config: Optional[QuantizationConfig], @@ -166,23 +167,29 @@ def build_model(model_class: Type[nn.Module], if prefix: extra_kwargs["prefix"] = prefix + # TODO: unify all the module initialization code + # to only take the `VllmConfig` object as input + from vllm.plugins import set_vllm_config + set_vllm_config(vllm_config) + return model_class(config=hf_config, cache_config=cache_config, quant_config=quant_config, **extra_kwargs) -def _initialize_model( - model_config: ModelConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - cache_config: CacheConfig, - scheduler_config: Optional[SchedulerConfig] = None) -> nn.Module: +def _initialize_model(vllm_config: VllmConfig) -> nn.Module: """Initialize a model with the given configurations.""" + model_config = vllm_config.model_config + lora_config = vllm_config.lora_config + scheduler_config = vllm_config.scheduler_config + cache_config = vllm_config.cache_config + load_config = vllm_config.load_config model_class, _ = 
get_model_architecture(model_config) return build_model( model_class, + vllm_config, model_config.hf_config, cache_config=cache_config, quant_config=_get_quantization_config(model_config, load_config), @@ -205,12 +212,7 @@ def download_model(self, model_config: ModelConfig) -> None: raise NotImplementedError @abstractmethod - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, *, vllm_config: VllmConfig) -> nn.Module: """Load a model with the given configurations.""" raise NotImplementedError @@ -396,18 +398,14 @@ def download_model(self, model_config: ModelConfig) -> None: model_config.revision, fall_back_to_pt=True) - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + device_config = vllm_config.device_config + model_config = vllm_config.model_config + target_device = torch.device(device_config.device) with set_default_torch_dtype(model_config.dtype): with target_device: - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config, - scheduler_config) + model = _initialize_model(vllm_config=vllm_config) model.load_weights(self._get_all_weights(model_config, model)) @@ -436,17 +434,12 @@ def __init__(self, load_config: LoadConfig): def download_model(self, model_config: ModelConfig) -> None: pass # Nothing to download - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + device_config = vllm_config.device_config + model_config = vllm_config.model_config with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config, - scheduler_config) + model = _initialize_model(vllm_config=vllm_config) # NOTE(woosuk): For accurate performance evaluation, we assign # random values to the weights. initialize_dummy_weights(model) @@ -488,10 +481,7 @@ def _get_weights_iterator( def _load_model_serialized_cpu( self, - model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - cache_config: CacheConfig, + vllm_config: VllmConfig, ) -> nn.Module: """Load a serialized model with tensorizer to the CPU. @@ -500,26 +490,30 @@ def _load_model_serialized_cpu( default HuggingFace loading, but will be slower than loading a vLLM-tensorized model. 
""" + device_config = vllm_config.device_config + model_config = vllm_config.model_config with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config) + model = _initialize_model(vllm_config=vllm_config) model.load_weights(self._get_weights_iterator()) return model.eval() def _load_model_serialized( self, - model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - cache_config: CacheConfig, + vllm_config: VllmConfig, ) -> nn.Module: """Load a serialized model with tensorizer. Expects a vLLM-tensorized model. See the examples/tensorize_vllm_model.py example script for serializing vLLM models.""" + + device_config = vllm_config.device_config + model_config = vllm_config.model_config + lora_config = vllm_config.lora_config + cache_config = vllm_config.cache_config + with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): model_class = get_model_architecture(model_config)[0] @@ -544,12 +538,9 @@ def download_model(self, model_config: ModelConfig) -> None: with self.tensorizer_config.open_stream(): pass - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + model_config = vllm_config.model_config + parallel_config = vllm_config.parallel_config self._verify_config(model_config, parallel_config) if parallel_config.tensor_parallel_size > 1: @@ -559,10 +550,8 @@ def load_model(self, *, model_config: ModelConfig, % get_tensor_model_parallel_rank() if is_vllm_tensorized(self.tensorizer_config): - return self._load_model_serialized(model_config, device_config, - lora_config, cache_config) - return self._load_model_serialized_cpu(model_config, device_config, - lora_config, cache_config) + return self._load_model_serialized(vllm_config=vllm_config) + return self._load_model_serialized_cpu(vllm_config=vllm_config) @staticmethod def save_model( @@ -648,12 +637,9 @@ def _prepare_weights(self, model_name_or_path: str, def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + device_config = vllm_config.device_config + model_config = vllm_config.model_config from safetensors.torch import safe_open from vllm.distributed import get_tensor_model_parallel_rank @@ -663,8 +649,7 @@ def load_model(self, *, model_config: ModelConfig, with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config) + model = _initialize_model(vllm_config=vllm_config) for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if quant_method is not None: @@ -1157,16 +1142,12 @@ def _load_weights(self, model_config: ModelConfig, def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, *, model_config: 
ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + device_config = vllm_config.device_config + model_config = vllm_config.model_config with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config) + model = _initialize_model(vllm_config=vllm_config) self._load_weights(model_config, model) @@ -1235,13 +1216,9 @@ def _get_weights_iterator( def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model) - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: - + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + device_config = vllm_config.device_config + model_config = vllm_config.model_config local_model_path = self._prepare_weights(model_config.model) gguf_weights_map = self._get_gguf_weights_map(model_config) # we can only know if tie word embeddings after mapping weights @@ -1251,8 +1228,7 @@ def load_model(self, *, model_config: ModelConfig, with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config) + model = _initialize_model(vllm_config=vllm_config) model.load_weights( self._get_weights_iterator(local_model_path, gguf_weights_map)) return model diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 4338cbc37f6c1..3336569f59467 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,8 +1,14 @@ import logging -from typing import Callable, Optional, Union +from typing import TYPE_CHECKING, Callable, Optional, Union import vllm.envs as envs -from vllm.compilation.config import CompilationConfig + +if TYPE_CHECKING: + from vllm.compilation.config import CompilationConfig + from vllm.config import VllmConfig +else: + CompilationConfig = None + VllmConfig = None logger = logging.getLogger(__name__) @@ -55,3 +61,15 @@ def set_compilation_config(config: Optional[CompilationConfig]): def get_compilation_config() -> Optional[CompilationConfig]: return _compilation_config + + +_vllm_config: Optional[VllmConfig] = None + + +def set_vllm_config(config: Optional[VllmConfig]): + global _vllm_config + _vllm_config = config + + +def get_vllm_config() -> Optional[VllmConfig]: + return _vllm_config diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 77c1e10ab6bdf..2510ea3700d0b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -369,13 +369,7 @@ def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 with patch("vllm.model_executor.layers.sampler.Sampler", Sampler): - self.model = get_model(model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config) + self.model = get_model(vllm_config=self.vllm_config) self.model_memory_usage = m.consumed_memory 
logger.info("Loading model weights took %.4f GB", diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index a98faa2f2d0cb..fdd72a452f2ad 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -453,13 +453,7 @@ def model_is_mrope(self) -> bool: return uses_mrope(self.model_config.hf_config) def load_model(self) -> None: - self.model = get_model(model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config) + self.model = get_model(vllm_config=self.vllm_config) def make_model_input_from_broadcasted_tensor_dict( self, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0e200e6abb05e..328dab598f8ef 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1051,13 +1051,7 @@ def __init__( def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: - self.model = get_model(model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config) + self.model = get_model(vllm_config=self.vllm_config) self.model_memory_usage = m.consumed_memory logger.info("Loading model weights took %.4f GB", diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 7d9d669a45ce3..a721186137328 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -137,15 +137,7 @@ def load_model(self) -> None: "vllm.model_executor.layers.vocab_parallel_embedding." 
"get_tensor_model_parallel_rank", return_value=xm_tp_rank): - model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - parallel_config=self.parallel_config, - cache_config=self.cache_config, - scheduler_config=self.scheduler_config, - lora_config=None, - ) + model = get_model(vllm_config=self.vllm_config) model = model.eval() xm.wait_device_ops() self.model = ModelWrapper(model) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index f37d70bee76ed..bae8b469767b2 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -405,15 +405,7 @@ def __init__( def load_model(self) -> None: with DeviceMemoryProfiler() as m: - self.model = get_model( - model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config, - ) + self.model = get_model(vllm_config=self.vllm_config) self.model_memory_usage = m.consumed_memory logger.info("Loading model weights took %.4f GB", From 1b73ab2a1f0761a60b28aabe0456a5735de027c5 Mon Sep 17 00:00:00 2001 From: Nikita Furin Date: Sat, 2 Nov 2024 22:50:28 +0300 Subject: [PATCH 28/43] [CI/Build] Quoting around > (#9956) --- Dockerfile | 2 +- Dockerfile.neuron | 2 +- Dockerfile.ppc64le | 2 +- Dockerfile.rocm | 2 +- Dockerfile.tpu | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0a562253c537b..343364da2ebf5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -206,7 +206,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10 + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10 ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 0d0d8df94578c..2143315d2a078 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -31,7 +31,7 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN python3 -m pip install -U \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-neuron.txt ENV VLLM_TARGET_DEVICE neuron diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index cd5fcf481f07c..b19c6ddec7948 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 562117a313020..8fb79afaebe97 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -52,7 +52,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --pre \ torch==2.6.0.dev20240918 \ - setuptools-scm>=8 \ + 'setuptools-scm>=8' \ torchvision==0.20.0.dev20240918 \ --extra-index-url 
https://download.pytorch.org/whl/nightly/rocm6.2;; \ *) ;; esac diff --git a/Dockerfile.tpu b/Dockerfile.tpu index dd8f9ad4714a9..b43442e4c0af1 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -25,7 +25,7 @@ ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-tpu.txt RUN python3 setup.py develop From ae5279a16385e15c07ab2bcadcbcab44367595e9 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Sun, 3 Nov 2024 03:56:05 +0800 Subject: [PATCH 29/43] [torch.compile] Adding torch compile to vision-language models (#9946) --- vllm/model_executor/models/llava_next.py | 10 +++++++--- vllm/model_executor/models/minicpmv.py | 7 ++++++- vllm/model_executor/models/molmo.py | 12 ++++++++---- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index e8c5786066170..7a2c95594ddcd 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -606,7 +606,6 @@ def forward( :class:`LlavaNextImageInputs` """ if intermediate_tensors is not None: - input_ids = None inputs_embeds = None else: image_input = self._parse_and_validate_image_input(**kwargs) @@ -618,9 +617,14 @@ def forward( self.language_model.model.get_input_embeddings, lambda _: self._process_image_input(image_input), ) - input_ids = None else: - inputs_embeds = None + inputs_embeds = self.language_model.model.get_input_embeddings( + input_ids) + + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index a526a5dccd398..e7088edb97b2b 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -564,8 +564,13 @@ def forward( vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None + output = self.llm( - input_ids=None, + input_ids=input_ids, positions=positions, kv_caches=kv_caches, attn_metadata=attn_metadata, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 3c34227767e05..ba798833e26a9 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -15,6 +15,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.attention.selector import _Backend +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, MultiModalConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -713,6 +714,7 @@ def forward( return image_features +@support_torch_compile class MolmoModel(nn.Module): def __init__( @@ -1141,7 +1143,6 @@ def forward( **kwargs: object, ) -> SamplerOutput: if intermediate_tensors is not None: - input_ids = None inputs_embeds = None else: image_input = self._parse_and_validate_image_input(**kwargs) @@ -1156,10 +1157,13 @@ def forward( image_input["image_input_idx"], image_input["seq_len"], ) - - input_ids = None else: - 
inputs_embeds = None + inputs_embeds = self.model.embed_tokens(input_ids) + + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None hidden_states = self.model( input_ids=input_ids, From 3bb4befea7166850bdee3f72fe060c9c4044ba85 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 2 Nov 2024 15:54:05 -0700 Subject: [PATCH 30/43] [bugfix] fix tsts (#9959) Signed-off-by: youkaichao --- vllm/model_executor/model_loader/loader.py | 2 +- vllm/model_executor/models/utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 2cb9e0ca7c505..2cf4e92908353 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -150,7 +150,7 @@ def _get_model_initialization_kwargs( def build_model(model_class: Type[nn.Module], - vllm_config: VllmConfig, + vllm_config: Optional[VllmConfig], hf_config: PretrainedConfig, cache_config: Optional[CacheConfig], quant_config: Optional[QuantizationConfig], diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index c6ec1769fc5d1..fee97e8922a76 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -252,6 +252,7 @@ def init_vllm_registered_model( return build_model( model_class, + None, hf_config, cache_config, quant_config, From 1f1b6d6eda3ea5fbdf4566632ac8a9fa61b31593 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 3 Nov 2024 17:14:17 +0000 Subject: [PATCH 31/43] [V1] Support per-request seed (#9945) Signed-off-by: Nick Hill --- vllm/v1/sample/metadata.py | 5 +-- vllm/v1/sample/sampler.py | 23 +++++------ vllm/v1/worker/gpu_model_runner.py | 61 ++++++++++++++---------------- 3 files changed, 41 insertions(+), 48 deletions(-) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 28614377b27b9..9ef36f2e6b212 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import List, Optional +from typing import Dict import torch @@ -16,7 +16,6 @@ class SamplingMetadata: no_top_p: bool no_top_k: bool - generators: List[Optional[torch.Generator]] - no_generator: bool + generators: Dict[int, torch.Generator] max_num_logprobs: int diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 157c4dd6d771e..927f274541c4d 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,5 +1,5 @@ """A layer that samples the next tokens from the model's outputs.""" -from typing import List, Optional +from typing import Dict import torch import torch.nn as nn @@ -84,22 +84,21 @@ def greedy_sample(self, probs: torch.Tensor) -> torch.Tensor: def random_sample( self, probs: torch.Tensor, - generators: List[Optional[torch.Generator]], - no_generator: bool, + generators: Dict[int, torch.Generator], ) -> torch.Tensor: q = torch.empty_like(probs) # NOTE(woosuk): To batch-process the requests without their own seeds, # which is the common case, we first assume that every request does # not have its own seed. Then, we overwrite the values for the requests # that have their own seeds. 
- q.exponential_() - if not no_generator: - assert len(generators) == probs.shape[0] + if len(generators) != probs.shape[0]: + # This might still be done here unnecessarily if there are greedies + q.exponential_() + if generators: # TODO(woosuk): This can be slow because we handle each request # one by one. Optimize this. - for i, generator in enumerate(generators): - if generator is not None: - q[i].exponential_(generator=generator) + for i, generator in generators.items(): + q[i].exponential_(generator=generator) return probs.div_(q).argmax(dim=-1).view(-1) def sample( @@ -112,13 +111,11 @@ def sample( if sampling_metadata.all_greedy: return self.greedy_sample(probs) if sampling_metadata.all_random: - return self.random_sample(probs, sampling_metadata.generators, - sampling_metadata.no_generator) + return self.random_sample(probs, sampling_metadata.generators) greedy_sampled = self.greedy_sample(probs) random_sampled = self.random_sample(probs, - sampling_metadata.generators, - sampling_metadata.no_generator) + sampling_metadata.generators) sampled = torch.where( sampling_metadata.temperature < _SAMPLING_EPS, greedy_sampled, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2510ea3700d0b..ae4239f8e1fab 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -128,13 +128,20 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Add new requests to the cached states. for req_data in scheduler_output.scheduled_new_reqs: req_id = req_data.req_id + sampling_params = req_data.sampling_params + if sampling_params.seed is not None: + generator = torch.Generator(device=self.device) + generator.manual_seed(sampling_params.seed) + else: + generator = None + self.requests[req_id] = CachedRequestState( req_id=req_id, prompt_token_ids=req_data.prompt_token_ids, prompt=req_data.prompt, multi_modal_data=req_data.multi_modal_data, - sampling_params=req_data.sampling_params, - generator=None, # TODO + sampling_params=sampling_params, + generator=generator, block_ids=req_data.block_ids, num_computed_tokens=req_data.num_computed_tokens, output_token_ids=[], @@ -342,11 +349,9 @@ def execute_model( else: # Ignore the sampled token from the partial request. # Rewind the generator state as if the token was not sampled. - generator = self.input_batch.generators[i] + generator = self.input_batch.generators.get(i) if generator is not None: - offset = generator.get_offset() - generator = generator.set_offset(offset - 1) - self.input_batch.generators[i] = generator + generator.set_offset(generator.get_offset() - 1) if sampler_output.logprob_token_ids is None: logprob_token_ids = None @@ -494,8 +499,8 @@ def __init__( self.top_k_cpu = self.top_k_cpu_tensor.numpy() self.top_k_reqs: Set[str] = set() - self.generators: List[Optional[torch.Generator]] = [None - ] * max_num_reqs + # req_index -> generator + self.generators: Dict[int, torch.Generator] = {} self.num_logprobs: Dict[str, int] = {} self.prompt_logprob_reqs: Set[str] = set() @@ -509,8 +514,9 @@ def add_request( req_index = self.num_reqs assert req_index < self.max_num_reqs - self.req_ids[req_index] = request.req_id - self.req_id_to_index[request.req_id] = req_index + req_id = request.req_id + self.req_ids[req_index] = req_id + self.req_id_to_index[req_id] = req_index # Copy the prompt token ids and output token ids. 
num_prompt_tokens = len(request.prompt_token_ids) @@ -528,27 +534,24 @@ def add_request( sampling_params = request.sampling_params self.temperature_cpu[req_index] = sampling_params.temperature if sampling_params.sampling_type == SamplingType.GREEDY: - self.greedy_reqs.add(req_index) - elif sampling_params.sampling_type == SamplingType.RANDOM: - self.random_reqs.add(req_index) - elif sampling_params.sampling_type == SamplingType.RANDOM_SEED: - # TODO(woosuk): Support per-request random seed. - raise NotImplementedError("Per-request seed is not supported yet.") + self.greedy_reqs.add(req_id) + else: + self.random_reqs.add(req_id) self.top_p_cpu[req_index] = sampling_params.top_p if sampling_params.top_p < 1: - self.top_p_reqs.add(req_index) + self.top_p_reqs.add(req_id) self.top_k_cpu[req_index] = sampling_params.top_k if sampling_params.top_k > 0: - self.top_k_reqs.add(req_index) + self.top_k_reqs.add(req_id) self.generators[req_index] = request.generator num_logprobs = sampling_params.logprobs if num_logprobs is not None and num_logprobs > 0: - self.num_logprobs[request.req_id] = num_logprobs + self.num_logprobs[req_id] = num_logprobs if sampling_params.prompt_logprobs: - self.prompt_logprob_reqs.add(req_index) + self.prompt_logprob_reqs.add(req_id) def remove_request(self, req_id: str) -> Optional[int]: req_index = self.req_id_to_index.pop(req_id, None) @@ -560,7 +563,7 @@ def remove_request(self, req_id: str) -> Optional[int]: self.random_reqs.discard(req_id) self.top_p_reqs.discard(req_id) self.top_k_reqs.discard(req_id) - self.generators[req_index] = None + self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) self.prompt_logprob_reqs.discard(req_id) return req_index @@ -612,7 +615,9 @@ def condense(self, empty_req_indices: List[int]) -> None: last_req_index] self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index] - self.generators[empty_index] = self.generators[last_req_index] + generator = self.generators.pop(last_req_index, None) + if generator is not None: + self.generators[empty_index] = generator # Decrement last_req_index since it is now empty. 
last_req_index -= 1 @@ -636,8 +641,7 @@ def make_sampling_metadata( top_k=self.top_k[:self.num_reqs], no_top_p=self.no_top_p, no_top_k=self.no_top_k, - generators=self.generators[:self.num_reqs], - no_generator=self.no_generator, + generators=self.generators, max_num_logprobs=self.max_num_logprobs, ) @@ -661,16 +665,9 @@ def no_top_p(self) -> bool: def no_top_k(self) -> bool: return len(self.top_k_reqs) == 0 - @property - def no_generator(self) -> bool: - return len(self.generators) == 0 - @property def max_num_logprobs(self) -> int: - if self.num_logprobs: - return max(self.num_logprobs.values()) - else: - return 0 + return max(self.num_logprobs.values()) if self.num_logprobs else 0 @property def no_logprob(self) -> bool: From d40eb8960fcac8f8fa2e82975f679d6642850536 Mon Sep 17 00:00:00 2001 From: Shanshan Wang Date: Fri, 25 Oct 2024 16:19:30 +0000 Subject: [PATCH 32/43] initial commit for h2ovl-mississippi models Signed-off-by: Shanshan Wang --- docs/source/models/supported_models.rst | 6 + vllm/entrypoints/chat_utils.py | 2 +- vllm/model_executor/models/h2ovl.py | 350 ++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 3 +- vllm/transformers_utils/configs/__init__.py | 1 + vllm/transformers_utils/configs/h2ovl.py | 12 + 7 files changed, 373 insertions(+), 2 deletions(-) create mode 100644 vllm/model_executor/models/h2ovl.py create mode 100644 vllm/transformers_utils/configs/h2ovl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index a5c085bb84db9..55835d945b00c 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -440,6 +440,12 @@ Text Generation - :code:`THUDM/glm-4v-9b` etc. - - ✅︎ + * - :code:`H2OVLChatModel` + - H2OVL + - T + I\ :sup:`E+` + - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. 
+ - + - ✅︎ * - :code:`InternVLChatModel` - InternVL2 - T + I\ :sup:`E+` diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index bc2de2d162473..df4de9483c455 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -187,7 +187,7 @@ def _placeholder_str(self, modality: ModalityStr, if model_type.startswith("llava"): return self._cached_token_str(self._tokenizer, hf_config.image_token_index) - if model_type in ("chameleon", "internvl_chat", "NVLM_D"): + if model_type in ("chameleon", "internvl_chat", "NVLM_D", "h2ovl_chat"): return "" if model_type == "mllama": return "<|image|>" diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py new file mode 100644 index 0000000000000..a290fc7b14bcc --- /dev/null +++ b/vllm/model_executor/models/h2ovl.py @@ -0,0 +1,350 @@ +# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py +# -------------------------------------------------------- +# H2OVL +# Copyright (c) 2024 H2O.AI +# Licensed under Apache 2.0 License [see LICENSE for details] +# -------------------------------------------------------- +from typing import Optional, Tuple, List, Mapping +from functools import partial +from PIL import Image + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, + token_inputs) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.utils import cached_get_tokenizer +from vllm.utils import is_list_of + +from .intern_vit import InternVisionModel +from .internvl import (InternVLChatModel, + InternVLInputPipeline, + build_transform, + find_closest_aspect_ratio, + get_internvl_num_patches, + get_max_internvl_image_size, + IMG_START, IMG_END, IMG_CONTEXT) + + +# Modified to include blocks generated in second pass +def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, + max_num: int, image_size: int, + use_thumbnail: bool, + prior_aspect_ratio=None) -> Tuple[int, int, int, Tuple[int, int]]: + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set((i, j) for n in range(min_num, max_num + 1) + for i in range(1, n + 1) for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # If prior_aspect_ratio is provided, filter the target ratios + if prior_aspect_ratio is not None: + target_ratios = [ratio for ratio in target_ratios if + prior_aspect_ratio[0] % ratio[0] != 0 and prior_aspect_ratio[1] % ratio[1] != 0] + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, + target_ratios, orig_width, + orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + # add thumbnail image if num_blocks > 1 + if use_thumbnail and blocks > 1: + blocks += 1 + return blocks, target_width, target_height, target_aspect_ratio + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, + image_size: int, + use_thumbnail: bool) -> Tuple[List[Image.Image], Tuple[int, int]]: 
+ orig_width, orig_height = image.size + + # calculate the number of blocks without thumbnail + blocks, target_width, target_height, target_aspect_ratio = calculate_num_blocks( + orig_width, + orig_height, + min_num, + max_num, + image_size, + use_thumbnail=False) + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images, target_aspect_ratio + + +# New dynamic_preprocess2 with prior_aspect_ratio +def dynamic_preprocess2(image: Image.Image, min_num: int, max_num: int, + image_size: int, use_thumbnail: bool, prior_aspect_ratio: Tuple[int, int]) -> List[Image.Image]: + orig_width, orig_height = image.size + + # calculate the number of blocks based on prior aspect ratio + blocks, target_width, target_height, _ = calculate_num_blocks( + orig_width, + orig_height, + min_num, + max_num, + image_size, + use_thumbnail=False, + prior_aspect_ratio=prior_aspect_ratio) + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +def load_image1(image:Image.Image, input_size=448, min_num=1, max_num=6): + # image = Image.open(image_file).convert('RGB') + transform = build_transform(input_size=input_size) + images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values, target_aspect_ratio + +def load_image2(image:Image.Image, input_size=448, min_num=1, max_num=6, target_aspect_ratio=None): + # image = Image.open(image_file).convert('RGB') + transform = build_transform(input_size=input_size) + images = dynamic_preprocess2(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num, prior_aspect_ratio=target_aspect_ratio) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + +def image_to_pixel_values(image:Image.Image, + input_size: int, min_num: int, + max_num: int, use_thumbnail: bool, + use_MSAC: bool) -> torch.Tensor: + # When MSAC is turned on, we need to preprocess the image twice + if use_MSAC: + pixel_values, target_aspect_ratio = load_image1(image, input_size=input_size, min_num=min_num, max_num=max_num) + pixel_values2 = load_image2(image, input_size=input_size, min_num=min_num, max_num=max_num, 
target_aspect_ratio=target_aspect_ratio) + pixel_values = torch.cat([pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0) + + else: + transform = build_transform(input_size=input_size) + images, _ = dynamic_preprocess(image, + min_num=min_num, + max_num=max_num, + image_size=input_size, + use_thumbnail=use_thumbnail) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + + return pixel_values + +def image_to_pixel_values_wrapper(hf_config: PretrainedConfig, + max_dynamic_patch: Optional[int] = None): + image_size = hf_config.vision_config.image_size + min_num = hf_config.min_dynamic_patch + if max_dynamic_patch is None: + max_dynamic_patch = hf_config.max_dynamic_patch + use_thumbnail = hf_config.use_thumbnail + use_MSAC = hf_config.use_msac + return partial(image_to_pixel_values, + input_size=image_size, + min_num=min_num, + max_num=max_dynamic_patch, + use_thumbnail=use_thumbnail, + use_MSAC=use_MSAC) + + +def get_max_internvl_image_tokens(ctx: InputContext, + *, + max_dynamic_patch: Optional[int] = None): + hf_config = ctx.get_hf_config() + vision_config = hf_config.vision_config + + + use_thumbnail = hf_config.use_thumbnail + max_dynamic_patch = hf_config.max_dynamic_patch + use_MSAC = hf_config.use_msac + + # calculate the actual max_dy + print('The max_dynamic_patch is:', max_dynamic_patch) + + image_size = vision_config.image_size + num_patches = get_internvl_num_patches(hf_config) + # return num_patches * max_dynamic_patch + + min_num = hf_config.min_dynamic_patch + max_num = hf_config.max_dynamic_patch + + # Assuming we're calculating for a dummy image with maximum size + max_image_width, max_image_height = get_max_internvl_image_size(ctx, max_dynamic_patch=max_dynamic_patch) + dummy_image = Image.new('RGB', (max_image_width, max_image_height)) + + # Calculate num_blocks based on the dummy image's size + num_blocks = image_to_pixel_values(dummy_image, + image_size, + min_num, + max_num, + use_thumbnail=use_thumbnail, + use_MSAC=use_MSAC).shape[0] + + # Return the final token count: num_blocks * num_patches + return num_blocks * num_patches + + +class H2OVLInputPipeline(InternVLInputPipeline): + def __init__(self): + super().__init__(IMG_START, IMG_END, IMG_CONTEXT) + def input_processor( + self, + ctx: InputContext, + inputs: DecoderOnlyInputs, + *, + max_dynamic_patch: Optional[int] = None, + ) -> DecoderOnlyInputs: + multi_modal_data = inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return inputs + + model_config = ctx.model_config + hf_config = ctx.get_hf_config() + + image_data = multi_modal_data["image"] + num_patches = get_internvl_num_patches(hf_config) + + # can only get the total blocks num after the image fully processed + num_blocks_calculator = image_to_pixel_values_wrapper(hf_config, max_dynamic_patch=max_dynamic_patch) + + if isinstance(image_data, Image.Image): + num_blocks = num_blocks_calculator(image_data).shape[0] + image_feature_sizes = [num_blocks * num_patches] + + elif is_list_of(image_data, Image.Image): + # Do not use MSAC for multi images + hf_config.use_msac = False + image_feature_sizes = [] + for image in image_data: + num_blocks = num_blocks_calculator(image).shape[0] + image_feature_sizes.append(num_blocks * num_patches) + + elif isinstance(image_data, torch.Tensor): + num_images, image_feature_size, hidden_size = image_data.shape + image_feature_sizes = [image_feature_size] + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + 
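# [Editor's illustration, not part of the patch] The dispatch above derives
# image_feature_sizes from the number of pixel-value blocks per image. As a
# hedged sketch of the arithmetic implied by image_to_pixel_values() earlier
# in this file (the helper name below is hypothetical, and it assumes both
# MSAC passes yield more than one crop so each pass appends a thumbnail):
def sketch_msac_block_count(crops_pass1: int, crops_pass2: int) -> int:
    # torch.cat([pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]])
    # keeps every crop from both passes plus a single shared thumbnail
    return crops_pass1 + crops_pass2 + 1

# e.g. 6 crops per pass -> 13 blocks, for which the prompt later reserves
# 13 * num_patches IMG_CONTEXT tokens
assert sketch_msac_block_count(6, 6) == 13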
+ tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code) + + prompt = inputs.get("prompt") + prompt_token_ids = inputs["prompt_token_ids"] + if prompt is None: + prompt = tokenizer.decode(prompt_token_ids) + + new_prompt = self._expand_image_prompt(prompt, image_feature_sizes, + num_patches) + new_prompt_token_ids = tokenizer.encode(new_prompt) + + return token_inputs(prompt=prompt, + prompt_token_ids=new_prompt_token_ids, + multi_modal_data=multi_modal_data) + + def input_mapper( + self, + ctx: InputContext, + data: object, + *, + max_dynamic_patch: Optional[int] = None, + ): + hf_config = ctx.get_hf_config() + + image_pixel_values_mapper = image_to_pixel_values_wrapper( + hf_config, max_dynamic_patch) + + if isinstance(data, Image.Image): + data = image_pixel_values_mapper(data) + data = data.unsqueeze(0) + elif is_list_of(data, Image.Image): + hf_config.use_msac = False + data = [image_pixel_values_mapper(img) for img in data] + + else: + return MultiModalInputs({"image_embeds": data}) + model_config = ctx.model_config + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code) + image_token_id = tokenizer.encode(self.img_context_token, + add_special_tokens=False, + return_tensors="pt")[0] + + return MultiModalInputs({ + "pixel_values": data, + "image_token_id": image_token_id + }) + + +input_pipeline = H2OVLInputPipeline() + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) +@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data) +@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) +class H2OVLChatModel(InternVLChatModel): + + def _init_vision_model( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + *, + is_mono: bool, + prefix: str, + ): + if not is_mono: + vision_feature_layer = config.select_layer + if vision_feature_layer < 0: + num_hidden_layers = config.vision_config.num_hidden_layers \ + + vision_feature_layer + 1 + else: + num_hidden_layers = vision_feature_layer + 1 + + # We added additional dummy heads to the original num of heads to + # make the number of heads divisible by 8. 
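# [Editor's illustration, not part of the patch] A hedged worked example of
# the select_layer arithmetic above, with an assumed 24-layer vision tower:
# select_layer = -1 keeps all 24 layers (24 + (-1) + 1), while -2 would
# truncate the tower to 23 layers before it is handed to InternVisionModel.
assert 24 + (-1) + 1 == 24 and 24 + (-2) + 1 == 23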
+ return InternVisionModel( + config.vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers, + prefix=prefix, + ) + else: + msg = "Monolith mode is not applicable to H2OVL" + raise NotImplementedError(msg) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f50ceaccb1bbe..6810c9bc94390 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -128,6 +128,7 @@ def add_embedding_models(base_models, embedding_models): "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), + "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 9bd2531d7a15c..33ca5d0b2f639 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -18,7 +18,7 @@ # yapf conflicts with isort for this block # yapf: disable from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - EAGLEConfig, ExaoneConfig, + EAGLEConfig, ExaoneConfig, H2OVLChatConfig, InternVLChatConfig, JAISConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, @@ -52,6 +52,7 @@ "medusa": MedusaConfig, "eagle": EAGLEConfig, "exaone": ExaoneConfig, + "h2ovl_chat": H2OVLChatConfig, "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index f0d79197a82c5..aedfe925a07cd 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -6,6 +6,7 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
from vllm.transformers_utils.configs.falcon import RWConfig +from vllm.transformers_utils.configs.h2ovl import H2OVLChatConfig from vllm.transformers_utils.configs.internvl import InternVLChatConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.medusa import MedusaConfig diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py new file mode 100644 index 0000000000000..7d4941c0120b8 --- /dev/null +++ b/vllm/transformers_utils/configs/h2ovl.py @@ -0,0 +1,12 @@ +# Adapted from +# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py +# -------------------------------------------------------- +# H2OVL +# Copyright (c) 2024 H2O.ai +# Licensed under Apache 2.0 License [see LICENSE for details] +# -------------------------------------------------------- + +from .internvl import InternVLChatConfig + +class H2OVLChatConfig(InternVLChatConfig): + model_type = "h2ovl_chat" \ No newline at end of file From 3d6ed4f1bdaab99ddcf84e06ebf2fa47f256a9e8 Mon Sep 17 00:00:00 2001 From: Shanshan Wang Date: Sun, 27 Oct 2024 01:21:03 +0000 Subject: [PATCH 33/43] added offline examples Signed-off-by: Shanshan Wang --- examples/offline_inference_vision_language.py | 26 ++++++++++ ...e_inference_vision_language_multi_image.py | 34 ++++++++++++++ vllm/model_executor/models/h2ovl.py | 47 ++++++------------- 3 files changed, 75 insertions(+), 32 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 60cdb186331fe..5419115a35b68 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -176,6 +176,31 @@ def run_minicpmv(question: str, modality: str): return llm, prompt, stop_token_ids +# H2OVL-Mississippi +def run_h2ovl(question: str, modality: str): + assert modality == "image" + + model_name = "h2oai/h2ovl-mississippi-2b" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [{'role': 'user', 'content': f"\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + return llm, prompt, stop_token_ids + + # InternVL def run_internvl(question: str, modality: str): assert modality == "image" @@ -363,6 +388,7 @@ def run_glm4v(question: str, modality: str): "chameleon": run_chameleon, "minicpmv": run_minicpmv, "blip-2": run_blip2, + "h2ovl_chat": run_h2ovl, "internvl_chat": run_internvl, "NVLM_D": run_nvlm_d, "qwen_vl": run_qwen_vl, diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index e28514bf403f7..15695af846935 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -106,6 +106,38 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: chat_template=None, ) +def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: + model_name = "h2oai/h2ovl-mississippi-2b" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={"max_dynamic_patch": 4}, + ) + + placeholders 
= "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -258,6 +290,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: model_example_map = { "phi3_v": load_phi3v, + "h2ovl_chat": load_h2onvl, "internvl_chat": load_internvl, "NVLM_D": load_nvlm_d, "qwen2_vl": load_qwen2_vl, @@ -285,6 +318,7 @@ def run_generate(model, question: str, image_urls: List[str]): for o in outputs: generated_text = o.outputs[0].text print(generated_text) + def run_chat(model: str, question: str, image_urls: List[str]): diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index a290fc7b14bcc..064a3f0df58f8 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -1,6 +1,7 @@ # adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py +# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py # -------------------------------------------------------- -# H2OVL +# H2OVL-Mississippi # Copyright (c) 2024 H2O.AI # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- @@ -30,7 +31,7 @@ IMG_START, IMG_END, IMG_CONTEXT) -# Modified to include blocks generated in second pass +# modified to include blocks generated in second pass def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, max_num: int, image_size: int, use_thumbnail: bool, @@ -43,7 +44,7 @@ def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, if i * j <= max_num and i * j >= min_num) target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) - # If prior_aspect_ratio is provided, filter the target ratios + # if prior_aspect_ratio is provided, filter the target ratios if prior_aspect_ratio is not None: target_ratios = [ratio for ratio in target_ratios if prior_aspect_ratio[0] % ratio[0] != 0 and prior_aspect_ratio[1] % ratio[1] != 0] @@ -95,7 +96,7 @@ def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, return processed_images, target_aspect_ratio -# New dynamic_preprocess2 with prior_aspect_ratio +# new dynamic_preprocess2 with prior_aspect_ratio def dynamic_preprocess2(image: Image.Image, min_num: int, max_num: int, image_size: int, use_thumbnail: bool, prior_aspect_ratio: Tuple[int, int]) -> List[Image.Image]: orig_width, orig_height = image.size @@ -148,7 +149,7 @@ def image_to_pixel_values(image:Image.Image, input_size: int, min_num: int, max_num: int, use_thumbnail: bool, use_MSAC: bool) -> torch.Tensor: - # When MSAC is turned on, we need to preprocess the image twice + # when MSAC is turned on, we need to process the image twice if use_MSAC: pixel_values, target_aspect_ratio = load_image1(image, input_size=input_size, min_num=min_num, max_num=max_num) pixel_values2 = load_image2(image, 
input_size=input_size, min_num=min_num, max_num=max_num, target_aspect_ratio=target_aspect_ratio) @@ -185,37 +186,21 @@ def image_to_pixel_values_wrapper(hf_config: PretrainedConfig, def get_max_internvl_image_tokens(ctx: InputContext, *, max_dynamic_patch: Optional[int] = None): + """ + Calculate the maximum number of tokens with/without MSAC and thumbnail + """ hf_config = ctx.get_hf_config() - vision_config = hf_config.vision_config - - use_thumbnail = hf_config.use_thumbnail - max_dynamic_patch = hf_config.max_dynamic_patch use_MSAC = hf_config.use_msac + + if max_dynamic_patch is None: + max_dynamic_patch = hf_config.max_dynamic_patch - # calculate the actual max_dy - print('The max_dynamic_patch is:', max_dynamic_patch) - - image_size = vision_config.image_size num_patches = get_internvl_num_patches(hf_config) - # return num_patches * max_dynamic_patch - - min_num = hf_config.min_dynamic_patch - max_num = hf_config.max_dynamic_patch - - # Assuming we're calculating for a dummy image with maximum size - max_image_width, max_image_height = get_max_internvl_image_size(ctx, max_dynamic_patch=max_dynamic_patch) - dummy_image = Image.new('RGB', (max_image_width, max_image_height)) - # Calculate num_blocks based on the dummy image's size - num_blocks = image_to_pixel_values(dummy_image, - image_size, - min_num, - max_num, - use_thumbnail=use_thumbnail, - use_MSAC=use_MSAC).shape[0] - - # Return the final token count: num_blocks * num_patches + coefficient = 2 if use_MSAC else 1 + num_blocks = coefficient * max_dynamic_patch + (1 if use_thumbnail else 0) + return num_blocks * num_patches @@ -337,8 +322,6 @@ def _init_vision_model( else: num_hidden_layers = vision_feature_layer + 1 - # We added additional dummy heads to the original num of heads to - # make the number of heads divisible by 8. 
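# [Editor's illustration, not part of the patch] A hedged worked example of
# the closed-form block count introduced above; the helper name and the
# num_patches value (256 tokens per 448x448 tile, as in InternVL-style
# configs) are assumptions made only for illustration.
def sketch_max_image_tokens(max_dynamic_patch: int, use_msac: bool,
                            use_thumbnail: bool, num_patches: int) -> int:
    coefficient = 2 if use_msac else 1
    num_blocks = coefficient * max_dynamic_patch + (1 if use_thumbnail else 0)
    return num_blocks * num_patches

# MSAC on, max_dynamic_patch = 6, thumbnail on: (2 * 6 + 1) * 256 = 3328
assert sketch_max_image_tokens(6, True, True, 256) == 3328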
return InternVisionModel( config.vision_config, quant_config=quant_config, From c737726b5c8799ffe22fe4ad1d1ff3c4e69b22d0 Mon Sep 17 00:00:00 2001 From: Shanshan Wang Date: Mon, 28 Oct 2024 06:17:51 +0000 Subject: [PATCH 34/43] format Signed-off-by: Shanshan Wang --- examples/offline_inference_vision_language.py | 2 +- ...e_inference_vision_language_multi_image.py | 3 +- vllm/entrypoints/chat_utils.py | 3 +- vllm/model_executor/models/h2ovl.py | 249 +++++++++++------- vllm/model_executor/models/registry.py | 2 +- vllm/transformers_utils/config.py | 3 +- vllm/transformers_utils/configs/__init__.py | 3 +- vllm/transformers_utils/configs/h2ovl.py | 7 +- 8 files changed, 172 insertions(+), 100 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 5419115a35b68..4fd002caf1763 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -501,4 +501,4 @@ def main(args): default=16, help='Number of frames to extract from the video.') args = parser.parse_args() - main(args) + main(args) \ No newline at end of file diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 15695af846935..d99684078ff3d 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -106,6 +106,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: chat_template=None, ) + def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "h2oai/h2ovl-mississippi-2b" @@ -139,6 +140,7 @@ def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: chat_template=None, ) + def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -318,7 +320,6 @@ def run_generate(model, question: str, image_urls: List[str]): for o in outputs: generated_text = o.outputs[0].text print(generated_text) - def run_chat(model: str, question: str, image_urls: List[str]): diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index df4de9483c455..c9552977710d1 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -187,7 +187,8 @@ def _placeholder_str(self, modality: ModalityStr, if model_type.startswith("llava"): return self._cached_token_str(self._tokenizer, hf_config.image_token_index) - if model_type in ("chameleon", "internvl_chat", "NVLM_D", "h2ovl_chat"): + if model_type in ("chameleon", "internvl_chat", "NVLM_D", + "h2ovl_chat"): return "" if model_type == "mllama": return "<|image|>" diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 064a3f0df58f8..233ede4a48be3 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -5,12 +5,11 @@ # Copyright (c) 2024 H2O.AI # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from typing import Optional, Tuple, List, Mapping from functools import partial -from PIL import Image +from typing import List, Optional, Tuple import torch -import torch.nn as nn +from PIL import Image from transformers import PretrainedConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, @@ -22,20 +21,21 @@ from vllm.utils import is_list_of from .intern_vit import InternVisionModel -from .internvl import 
(InternVLChatModel, - InternVLInputPipeline, - build_transform, - find_closest_aspect_ratio, - get_internvl_num_patches, - get_max_internvl_image_size, - IMG_START, IMG_END, IMG_CONTEXT) +from .internvl import (IMG_CONTEXT, IMG_END, IMG_START, InternVLChatModel, + InternVLInputPipeline, build_transform, + find_closest_aspect_ratio, get_internvl_num_patches) # modified to include blocks generated in second pass -def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, - max_num: int, image_size: int, - use_thumbnail: bool, - prior_aspect_ratio=None) -> Tuple[int, int, int, Tuple[int, int]]: +def calculate_num_blocks( + orig_width: int, + orig_height: int, + min_num: int, + max_num: int, + image_size: int, + use_thumbnail: bool, + prior_aspect_ratio=None, +) -> Tuple[int, int, int, Tuple[int, int]]: aspect_ratio = orig_width / orig_height # calculate the existing image aspect ratio @@ -46,8 +46,10 @@ def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, # if prior_aspect_ratio is provided, filter the target ratios if prior_aspect_ratio is not None: - target_ratios = [ratio for ratio in target_ratios if - prior_aspect_ratio[0] % ratio[0] != 0 and prior_aspect_ratio[1] % ratio[1] != 0] + target_ratios = [ + ratio for ratio in target_ratios if prior_aspect_ratio[0] % + ratio[0] != 0 and prior_aspect_ratio[1] % ratio[1] != 0 + ] # find the closest aspect ratio to the target target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, @@ -65,27 +67,35 @@ def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, - image_size: int, - use_thumbnail: bool) -> Tuple[List[Image.Image], Tuple[int, int]]: +def dynamic_preprocess( + image: Image.Image, + min_num: int, + max_num: int, + image_size: int, + use_thumbnail: bool, +) -> Tuple[List[Image.Image], Tuple[int, int]]: orig_width, orig_height = image.size # calculate the number of blocks without thumbnail - blocks, target_width, target_height, target_aspect_ratio = calculate_num_blocks( - orig_width, - orig_height, - min_num, - max_num, - image_size, - use_thumbnail=False) + blocks, target_width, target_height, target_aspect_ratio = ( + calculate_num_blocks( + orig_width, + orig_height, + min_num, + max_num, + image_size, + use_thumbnail=False, + )) # resize the image resized_img = image.resize((target_width, target_height)) processed_images = [] for i in range(blocks): - box = ((i % (target_width // image_size)) * image_size, - (i // (target_width // image_size)) * image_size, - ((i % (target_width // image_size)) + 1) * image_size, - ((i // (target_width // image_size)) + 1) * image_size) + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size, + ) # split the image split_img = resized_img.crop(box) processed_images.append(split_img) @@ -97,8 +107,14 @@ def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, # new dynamic_preprocess2 with prior_aspect_ratio -def dynamic_preprocess2(image: Image.Image, min_num: int, max_num: int, - image_size: int, use_thumbnail: bool, prior_aspect_ratio: Tuple[int, int]) -> List[Image.Image]: +def dynamic_preprocess2( + image: Image.Image, + min_num: int, + max_num: int, + image_size: int, + use_thumbnail: bool, + prior_aspect_ratio: Tuple[int, int], 
+) -> List[Image.Image]: orig_width, orig_height = image.size # calculate the number of blocks based on prior aspect ratio @@ -109,15 +125,18 @@ def dynamic_preprocess2(image: Image.Image, min_num: int, max_num: int, max_num, image_size, use_thumbnail=False, - prior_aspect_ratio=prior_aspect_ratio) + prior_aspect_ratio=prior_aspect_ratio, + ) # resize the image resized_img = image.resize((target_width, target_height)) processed_images = [] for i in range(blocks): - box = ((i % (target_width // image_size)) * image_size, - (i // (target_width // image_size)) * image_size, - ((i % (target_width // image_size)) + 1) * image_size, - ((i // (target_width // image_size)) + 1) * image_size) + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size, + ) # split the image split_img = resized_img.crop(box) processed_images.append(split_img) @@ -128,45 +147,82 @@ def dynamic_preprocess2(image: Image.Image, min_num: int, max_num: int, return processed_images -def load_image1(image:Image.Image, input_size=448, min_num=1, max_num=6): +def load_image1(image: Image.Image, input_size=448, min_num=1, max_num=6): # image = Image.open(image_file).convert('RGB') transform = build_transform(input_size=input_size) - images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num) + images, target_aspect_ratio = dynamic_preprocess( + image, + image_size=input_size, + use_thumbnail=True, + min_num=min_num, + max_num=max_num, + ) pixel_values = [transform(image) for image in images] pixel_values = torch.stack(pixel_values) return pixel_values, target_aspect_ratio -def load_image2(image:Image.Image, input_size=448, min_num=1, max_num=6, target_aspect_ratio=None): + +def load_image2( + image: Image.Image, + input_size=448, + min_num=1, + max_num=6, + target_aspect_ratio=None, +): # image = Image.open(image_file).convert('RGB') transform = build_transform(input_size=input_size) - images = dynamic_preprocess2(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num, prior_aspect_ratio=target_aspect_ratio) + images = dynamic_preprocess2( + image, + image_size=input_size, + use_thumbnail=True, + min_num=min_num, + max_num=max_num, + prior_aspect_ratio=target_aspect_ratio, + ) pixel_values = [transform(image) for image in images] pixel_values = torch.stack(pixel_values) return pixel_values -def image_to_pixel_values(image:Image.Image, - input_size: int, min_num: int, - max_num: int, use_thumbnail: bool, - use_MSAC: bool) -> torch.Tensor: +def image_to_pixel_values( + image: Image.Image, + input_size: int, + min_num: int, + max_num: int, + use_thumbnail: bool, + use_MSAC: bool, +) -> torch.Tensor: # when MSAC is turned on, we need to process the image twice if use_MSAC: - pixel_values, target_aspect_ratio = load_image1(image, input_size=input_size, min_num=min_num, max_num=max_num) - pixel_values2 = load_image2(image, input_size=input_size, min_num=min_num, max_num=max_num, target_aspect_ratio=target_aspect_ratio) - pixel_values = torch.cat([pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0) + pixel_values, target_aspect_ratio = load_image1(image, + input_size=input_size, + min_num=min_num, + max_num=max_num) + pixel_values2 = load_image2( + image, + input_size=input_size, + min_num=min_num, + max_num=max_num, + target_aspect_ratio=target_aspect_ratio, 
+ ) + pixel_values = torch.cat( + [pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0) else: transform = build_transform(input_size=input_size) - images, _ = dynamic_preprocess(image, - min_num=min_num, - max_num=max_num, - image_size=input_size, - use_thumbnail=use_thumbnail) + images, _ = dynamic_preprocess( + image, + min_num=min_num, + max_num=max_num, + image_size=input_size, + use_thumbnail=use_thumbnail, + ) pixel_values = [transform(image) for image in images] - pixel_values = torch.stack(pixel_values) + pixel_values = torch.stack(pixel_values) return pixel_values + def image_to_pixel_values_wrapper(hf_config: PretrainedConfig, max_dynamic_patch: Optional[int] = None): image_size = hf_config.vision_config.image_size @@ -175,12 +231,14 @@ def image_to_pixel_values_wrapper(hf_config: PretrainedConfig, max_dynamic_patch = hf_config.max_dynamic_patch use_thumbnail = hf_config.use_thumbnail use_MSAC = hf_config.use_msac - return partial(image_to_pixel_values, - input_size=image_size, - min_num=min_num, - max_num=max_dynamic_patch, - use_thumbnail=use_thumbnail, - use_MSAC=use_MSAC) + return partial( + image_to_pixel_values, + input_size=image_size, + min_num=min_num, + max_num=max_dynamic_patch, + use_thumbnail=use_thumbnail, + use_MSAC=use_MSAC, + ) def get_max_internvl_image_tokens(ctx: InputContext, @@ -192,12 +250,12 @@ def get_max_internvl_image_tokens(ctx: InputContext, hf_config = ctx.get_hf_config() use_thumbnail = hf_config.use_thumbnail use_MSAC = hf_config.use_msac - + if max_dynamic_patch is None: max_dynamic_patch = hf_config.max_dynamic_patch num_patches = get_internvl_num_patches(hf_config) - + coefficient = 2 if use_MSAC else 1 num_blocks = coefficient * max_dynamic_patch + (1 if use_thumbnail else 0) @@ -205,32 +263,35 @@ def get_max_internvl_image_tokens(ctx: InputContext, class H2OVLInputPipeline(InternVLInputPipeline): + def __init__(self): super().__init__(IMG_START, IMG_END, IMG_CONTEXT) + def input_processor( - self, + self, ctx: InputContext, inputs: DecoderOnlyInputs, - *, + *, max_dynamic_patch: Optional[int] = None, ) -> DecoderOnlyInputs: multi_modal_data = inputs.get("multi_modal_data") if multi_modal_data is None or "image" not in multi_modal_data: return inputs - + model_config = ctx.model_config hf_config = ctx.get_hf_config() - + image_data = multi_modal_data["image"] num_patches = get_internvl_num_patches(hf_config) - + # can only get the total blocks num after the image fully processed - num_blocks_calculator = image_to_pixel_values_wrapper(hf_config, max_dynamic_patch=max_dynamic_patch) - + num_blocks_calculator = image_to_pixel_values_wrapper( + hf_config, max_dynamic_patch=max_dynamic_patch) + if isinstance(image_data, Image.Image): num_blocks = num_blocks_calculator(image_data).shape[0] image_feature_sizes = [num_blocks * num_patches] - + elif is_list_of(image_data, Image.Image): # Do not use MSAC for multi images hf_config.use_msac = False @@ -238,64 +299,70 @@ def input_processor( for image in image_data: num_blocks = num_blocks_calculator(image).shape[0] image_feature_sizes.append(num_blocks * num_patches) - + elif isinstance(image_data, torch.Tensor): num_images, image_feature_size, hidden_size = image_data.shape image_feature_sizes = [image_feature_size] else: raise TypeError(f"Invalid image type: {type(image_data)}") - + tokenizer = cached_get_tokenizer( model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - + trust_remote_code=model_config.trust_remote_code, + ) + prompt = inputs.get("prompt") 
prompt_token_ids = inputs["prompt_token_ids"] if prompt is None: prompt = tokenizer.decode(prompt_token_ids) - + new_prompt = self._expand_image_prompt(prompt, image_feature_sizes, num_patches) new_prompt_token_ids = tokenizer.encode(new_prompt) - return token_inputs(prompt=prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=multi_modal_data) - + return token_inputs( + prompt=prompt, + prompt_token_ids=new_prompt_token_ids, + multi_modal_data=multi_modal_data, + ) + def input_mapper( - self, + self, ctx: InputContext, data: object, *, max_dynamic_patch: Optional[int] = None, ): hf_config = ctx.get_hf_config() - + image_pixel_values_mapper = image_to_pixel_values_wrapper( hf_config, max_dynamic_patch) - + if isinstance(data, Image.Image): data = image_pixel_values_mapper(data) data = data.unsqueeze(0) elif is_list_of(data, Image.Image): - hf_config.use_msac = False + hf_config.use_msac = False data = [image_pixel_values_mapper(img) for img in data] - + else: return MultiModalInputs({"image_embeds": data}) model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - image_token_id = tokenizer.encode(self.img_context_token, - add_special_tokens=False, - return_tensors="pt")[0] + trust_remote_code=model_config.trust_remote_code, + ) + image_token_id = tokenizer.encode( + self.img_context_token, + add_special_tokens=False, + return_tensors="pt", + )[0] return MultiModalInputs({ "pixel_values": data, "image_token_id": image_token_id }) - + input_pipeline = H2OVLInputPipeline() @@ -317,8 +384,8 @@ def _init_vision_model( if not is_mono: vision_feature_layer = config.select_layer if vision_feature_layer < 0: - num_hidden_layers = config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 + num_hidden_layers = (config.vision_config.num_hidden_layers + + vision_feature_layer + 1) else: num_hidden_layers = vision_feature_layer + 1 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 6810c9bc94390..3a929f5cb5195 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -483,4 +483,4 @@ def _run() -> None: if __name__ == "__main__": - _run() + _run() \ No newline at end of file diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 33ca5d0b2f639..08697274854e0 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -18,7 +18,8 @@ # yapf conflicts with isort for this block # yapf: disable from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - EAGLEConfig, ExaoneConfig, H2OVLChatConfig, + EAGLEConfig, ExaoneConfig, + H2OVLChatConfig, InternVLChatConfig, JAISConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index aedfe925a07cd..d1e19c9a33c24 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -23,6 +23,7 @@ "DbrxConfig", "MPTConfig", "RWConfig", + "H2OVLChatConfig", "InternVLChatConfig", "JAISConfig", "MedusaConfig", @@ -34,4 +35,4 @@ "NVLM_D_Config", "SolarConfig", "UltravoxConfig", -] +] \ No newline at end of file diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py index 7d4941c0120b8..b94c5b77e4b7f 100644 --- a/vllm/transformers_utils/configs/h2ovl.py +++ b/vllm/transformers_utils/configs/h2ovl.py @@ 
-1,12 +1,13 @@ # Adapted from # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py # -------------------------------------------------------- -# H2OVL -# Copyright (c) 2024 H2O.ai +# H2OVL-Mississippi +# Copyright (c) 2024 H2O.AI # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- from .internvl import InternVLChatConfig + class H2OVLChatConfig(InternVLChatConfig): - model_type = "h2ovl_chat" \ No newline at end of file + model_type = "h2ovl_chat" From bc8d3f158b87c2bf99b1748d2066901a7311695d Mon Sep 17 00:00:00 2001 From: Shanshan Wang Date: Tue, 29 Oct 2024 23:08:00 +0000 Subject: [PATCH 35/43] Refactor code to eliminate duplicate implementations and avoid redundant image preprocessing calls Signed-off-by: Shanshan Wang --- vllm/model_executor/models/h2ovl.py | 156 +++++++++++++--------------- 1 file changed, 70 insertions(+), 86 deletions(-) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 233ede4a48be3..234bcb1e9eb1b 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -67,16 +67,18 @@ def calculate_num_blocks( # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +# refactored to handle prior_aspect_ratio as optional def dynamic_preprocess( image: Image.Image, min_num: int, max_num: int, image_size: int, use_thumbnail: bool, + prior_aspect_ratio: Optional[Tuple[int, int]] = None, ) -> Tuple[List[Image.Image], Tuple[int, int]]: orig_width, orig_height = image.size - # calculate the number of blocks without thumbnail + # calculate the number of blocks based on prior aspect ratio if available blocks, target_width, target_height, target_aspect_ratio = ( calculate_num_blocks( orig_width, @@ -85,6 +87,7 @@ def dynamic_preprocess( max_num, image_size, use_thumbnail=False, + prior_aspect_ratio=prior_aspect_ratio, )) # resize the image resized_img = image.resize((target_width, target_height)) @@ -106,84 +109,29 @@ def dynamic_preprocess( return processed_images, target_aspect_ratio -# new dynamic_preprocess2 with prior_aspect_ratio -def dynamic_preprocess2( - image: Image.Image, - min_num: int, - max_num: int, - image_size: int, - use_thumbnail: bool, - prior_aspect_ratio: Tuple[int, int], -) -> List[Image.Image]: - orig_width, orig_height = image.size - - # calculate the number of blocks based on prior aspect ratio - blocks, target_width, target_height, _ = calculate_num_blocks( - orig_width, - orig_height, - min_num, - max_num, - image_size, - use_thumbnail=False, - prior_aspect_ratio=prior_aspect_ratio, - ) - # resize the image - resized_img = image.resize((target_width, target_height)) - processed_images = [] - for i in range(blocks): - box = ( - (i % (target_width // image_size)) * image_size, - (i // (target_width // image_size)) * image_size, - ((i % (target_width // image_size)) + 1) * image_size, - ((i // (target_width // image_size)) + 1) * image_size, - ) - # split the image - split_img = resized_img.crop(box) - processed_images.append(split_img) - assert len(processed_images) == blocks - if use_thumbnail and len(processed_images) != 1: - thumbnail_img = image.resize((image_size, image_size)) - processed_images.append(thumbnail_img) - return processed_images - - -def load_image1(image: Image.Image, input_size=448, min_num=1, max_num=6): - # image = Image.open(image_file).convert('RGB') - transform = build_transform(input_size=input_size) - images, target_aspect_ratio = dynamic_preprocess( 
- image, - image_size=input_size, - use_thumbnail=True, - min_num=min_num, - max_num=max_num, - ) - pixel_values = [transform(image) for image in images] - pixel_values = torch.stack(pixel_values) - return pixel_values, target_aspect_ratio - - -def load_image2( +def load_image( image: Image.Image, input_size=448, min_num=1, max_num=6, - target_aspect_ratio=None, -): - # image = Image.open(image_file).convert('RGB') + use_thumbnail=True, + prior_aspect_ratio: Optional[Tuple[int, int]] = None, +) -> Tuple[torch.Tensor, Tuple[int, int]]: transform = build_transform(input_size=input_size) - images = dynamic_preprocess2( + images, target_aspect_ratio = dynamic_preprocess( image, image_size=input_size, - use_thumbnail=True, + use_thumbnail=use_thumbnail, min_num=min_num, max_num=max_num, - prior_aspect_ratio=target_aspect_ratio, + prior_aspect_ratio=prior_aspect_ratio, ) pixel_values = [transform(image) for image in images] pixel_values = torch.stack(pixel_values) - return pixel_values + return pixel_values, target_aspect_ratio +# refactored to use the combined load_image function def image_to_pixel_values( image: Image.Image, input_size: int, @@ -194,31 +142,34 @@ def image_to_pixel_values( ) -> torch.Tensor: # when MSAC is turned on, we need to process the image twice if use_MSAC: - pixel_values, target_aspect_ratio = load_image1(image, - input_size=input_size, - min_num=min_num, - max_num=max_num) - pixel_values2 = load_image2( + # first pass + pixel_values, target_aspect_ratio = load_image( + image, + input_size=input_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=True, + ) + # second pass + pixel_values2, _ = load_image( image, input_size=input_size, min_num=min_num, max_num=max_num, - target_aspect_ratio=target_aspect_ratio, + prior_aspect_ratio=target_aspect_ratio, ) + # combine pixel values pixel_values = torch.cat( [pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0) else: - transform = build_transform(input_size=input_size) - images, _ = dynamic_preprocess( + pixel_values, _ = load_image( image, + input_size=input_size, min_num=min_num, max_num=max_num, - image_size=input_size, use_thumbnail=use_thumbnail, ) - pixel_values = [transform(image) for image in images] - pixel_values = torch.stack(pixel_values) return pixel_values @@ -263,6 +214,9 @@ def get_max_internvl_image_tokens(ctx: InputContext, class H2OVLInputPipeline(InternVLInputPipeline): + """ + Input pipeline for processing image and text data for the H2OVL model. 
+ """ def __init__(self): super().__init__(IMG_START, IMG_END, IMG_CONTEXT) @@ -274,6 +228,7 @@ def input_processor( *, max_dynamic_patch: Optional[int] = None, ) -> DecoderOnlyInputs: + # get multi_modal_data multi_modal_data = inputs.get("multi_modal_data") if multi_modal_data is None or "image" not in multi_modal_data: return inputs @@ -284,25 +239,32 @@ def input_processor( image_data = multi_modal_data["image"] num_patches = get_internvl_num_patches(hf_config) - # can only get the total blocks num after the image fully processed - num_blocks_calculator = image_to_pixel_values_wrapper( + image_pixel_values_mapper = image_to_pixel_values_wrapper( hf_config, max_dynamic_patch=max_dynamic_patch) + # single image if isinstance(image_data, Image.Image): - num_blocks = num_blocks_calculator(image_data).shape[0] + pixel_values = image_pixel_values_mapper(image_data) + num_blocks = pixel_values.shape[0] image_feature_sizes = [num_blocks * num_patches] + pixel_values = pixel_values.unsqueeze(0) + # multi images elif is_list_of(image_data, Image.Image): # Do not use MSAC for multi images hf_config.use_msac = False image_feature_sizes = [] - for image in image_data: - num_blocks = num_blocks_calculator(image).shape[0] + pixel_values = [ + image_pixel_values_mapper(image) for image in image_data + ] + for pixel_value in pixel_values: + num_blocks = pixel_value.shape[0] image_feature_sizes.append(num_blocks * num_patches) elif isinstance(image_data, torch.Tensor): num_images, image_feature_size, hidden_size = image_data.shape image_feature_sizes = [image_feature_size] + pixel_values = image_data else: raise TypeError(f"Invalid image type: {type(image_data)}") @@ -320,6 +282,20 @@ def input_processor( num_patches) new_prompt_token_ids = tokenizer.encode(new_prompt) + # Wrap image processing in input_processor to avoid duplication + image_token_id = tokenizer.encode( + self.img_context_token, + add_special_tokens=False, + return_tensors="pt", + )[0] + # Prepare image_data dictionary + image_data = { + "pixel_values": pixel_values, + "image_token_id": image_token_id, + } + # Update multi_modal_data + multi_modal_data = {"image": image_data} + return token_inputs( prompt=prompt, prompt_token_ids=new_prompt_token_ids, @@ -332,18 +308,26 @@ def input_mapper( data: object, *, max_dynamic_patch: Optional[int] = None, - ): + ) -> MultiModalInputs: + + # NOTE: when preprocessing for the image data is done in the + # 'input_processor' function + if isinstance(data, dict): + return MultiModalInputs(data) + + # these is only for dummy data hf_config = ctx.get_hf_config() image_pixel_values_mapper = image_to_pixel_values_wrapper( hf_config, max_dynamic_patch) if isinstance(data, Image.Image): - data = image_pixel_values_mapper(data) - data = data.unsqueeze(0) + pixel_values = image_pixel_values_mapper(data) + pixel_values = pixel_values.unsqueeze(0) + elif is_list_of(data, Image.Image): hf_config.use_msac = False - data = [image_pixel_values_mapper(img) for img in data] + pixel_values = [image_pixel_values_mapper(img) for img in data] else: return MultiModalInputs({"image_embeds": data}) @@ -359,7 +343,7 @@ def input_mapper( )[0] return MultiModalInputs({ - "pixel_values": data, + "pixel_values": pixel_values, "image_token_id": image_token_id }) From 05df66da6c402f925fced3601d81b1c343e59945 Mon Sep 17 00:00:00 2001 From: Shanshan Wang Date: Wed, 30 Oct 2024 13:20:48 +0000 Subject: [PATCH 36/43] expose use_MSAC for image_pixel_values_mapper Signed-off-by: Shanshan Wang --- vllm/model_executor/models/h2ovl.py | 19 
++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 234bcb1e9eb1b..098830ed6cf66 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -175,13 +175,15 @@ def image_to_pixel_values( def image_to_pixel_values_wrapper(hf_config: PretrainedConfig, - max_dynamic_patch: Optional[int] = None): + max_dynamic_patch: Optional[int] = None, + use_MSAC: Optional[bool] = None): image_size = hf_config.vision_config.image_size min_num = hf_config.min_dynamic_patch if max_dynamic_patch is None: max_dynamic_patch = hf_config.max_dynamic_patch + if use_MSAC is None: + use_MSAC = hf_config.use_msac use_thumbnail = hf_config.use_thumbnail - use_MSAC = hf_config.use_msac return partial( image_to_pixel_values, input_size=image_size, @@ -218,9 +220,6 @@ class H2OVLInputPipeline(InternVLInputPipeline): Input pipeline for processing image and text data for the H2OVL model. """ - def __init__(self): - super().__init__(IMG_START, IMG_END, IMG_CONTEXT) - def input_processor( self, ctx: InputContext, @@ -235,6 +234,7 @@ def input_processor( model_config = ctx.model_config hf_config = ctx.get_hf_config() + use_MSAC = hf_config.use_msac image_data = multi_modal_data["image"] num_patches = get_internvl_num_patches(hf_config) @@ -244,7 +244,8 @@ def input_processor( # single image if isinstance(image_data, Image.Image): - pixel_values = image_pixel_values_mapper(image_data) + pixel_values = image_pixel_values_mapper(image_data, + use_MSAC=use_MSAC) num_blocks = pixel_values.shape[0] image_feature_sizes = [num_blocks * num_patches] pixel_values = pixel_values.unsqueeze(0) @@ -252,10 +253,10 @@ def input_processor( # multi images elif is_list_of(image_data, Image.Image): # Do not use MSAC for multi images - hf_config.use_msac = False image_feature_sizes = [] pixel_values = [ - image_pixel_values_mapper(image) for image in image_data + image_pixel_values_mapper(image, use_MSAC=False) + for image in image_data ] for pixel_value in pixel_values: num_blocks = pixel_value.shape[0] @@ -348,7 +349,7 @@ def input_mapper( }) -input_pipeline = H2OVLInputPipeline() +input_pipeline = H2OVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) From 41ef461134b3c4ea25221715a9f62e7a8747de15 Mon Sep 17 00:00:00 2001 From: Shanshan Wang Date: Thu, 31 Oct 2024 20:09:21 +0000 Subject: [PATCH 37/43] added tests for image preprocessing and model output Signed-off-by: Shanshan Wang --- .../vision_language/test_h2ovl.py | 340 ++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 tests/models/decoder_only/vision_language/test_h2ovl.py diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py new file mode 100644 index 0000000000000..541f427407b13 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -0,0 +1,340 @@ +import types +from typing import List, Optional, Tuple, Type, Union +import pytest +import torch +from PIL.Image import Image +from transformers import AutoConfig +from vllm.multimodal.utils import rescale_image_size +from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, + _ImageAssets) +from ...utils import check_logprobs_close +from .test_internvl import generate + +# Import the functions to test +from vllm.model_executor.models.h2ovl import ( + image_to_pixel_values_wrapper, + 
calculate_num_blocks, + IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values +) + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "<|prompt|>\nWhat's the content in the center of the image?<|end|><|answer|>", # noqa: E501 + "cherry_blossom": + "<|prompt|>\nWhat is the season?<|end|><|answer|>", # noqa: E501 +}) +HF_MULTIIMAGE_IMAGE_PROMPT = "<|prompt|>Image-1: \nImage-2: \nDescribe the two images in short.<|end|><|answer|>" # noqa: E501 + +models = [ + "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names + "h2oai/h2ovl-mississippi-2b", +] +target_dtype = "bfloat16" + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test are from IMAGE_ASSETS. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding MultiModalConfig as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + + class H2OVLProcessor: + """A simple processor for H2OVL models.""" + + def __init__(self, hf_runner: HfRunner): + self.num_image_token = hf_runner.model.num_image_token + self.tokenizer = hf_runner.tokenizer + self.dtype = hf_runner.model.dtype + + self.config = AutoConfig.from_pretrained(hf_runner.model_name, + trust_remote_code=True) + self.vision_config = self.config.vision_config + self.use_thumbnail = self.config.use_thumbnail + self.min_num = self.config.min_dynamic_patch + self.max_num = self.config.max_dynamic_patch + self.image_size = self.vision_config.image_size + + def __call__(self, text: str, images: Union[Image, List[Image]], + **kwargs): + images = [images] if isinstance(images, Image) else images + pixel_values = [ + image_to_pixel_values(image, self.image_size, self.min_num, + self.max_num, + self.use_thumbnail, + use_MSAC=self.config.use_msac).to(self.dtype) + for image in images + ] + num_patches_list = [ + pixel_value.shape[0] for pixel_value in pixel_values + ] + pixel_values = torch.cat(pixel_values, dim=0) + for num_patches in num_patches_list: + context_tokens = IMG_CONTEXT * self.num_image_token \ + * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") + prompt.update({"pixel_values": pixel_values}) + return prompt + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=8192, + dtype=dtype, + limit_mm_per_prompt={"image": mm_limit}, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs + ] + + with hf_runner(model, dtype=dtype) as hf_model: + img_context_token_id = 
hf_model.tokenizer.convert_tokens_to_ids( + "") + hf_model.model.img_context_token_id = img_context_token_id + hf_model.processor = H2OVLProcessor(hf_model) + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language_model.get_output_embeddings() + hf_model.model.generate = types.MethodType(generate, hf_model.model) + eos_token_id = hf_model.tokenizer.eos_token_id + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=hf_images, + eos_token_id=eos_token_id) + for prompts, hf_images in inputs + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@torch.inference_mode() +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + run_test( + hf_runner, + vllm_runner, + inputs_per_image, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.5, 0.75, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@torch.inference_mode() +def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, + size_factors, dtype: str, max_tokens: int, + num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_case = [ + ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], + [[rescale_image_size(image, factor) for image in images] + for factor in size_factors]) + ] + + run_test( + hf_runner, + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=2, + tensor_parallel_size=1, + ) + + +def run_preprocessing_test( + image: Image, + config, + max_dynamic_patch: Optional[int] = None, +) -> Tuple[torch.Tensor, int]: + """Test the image preprocessing and calculate expected blocks.""" + + if max_dynamic_patch is None: + max_dynamic_patch = config.max_dynamic_patch + + width, height = image.size + use_MSAC = config.use_msac + + # Create the mapper function with the provided configuration + mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC) + pixel_values = mapper(image) + + # Calculate the expected number of blocks + if use_MSAC: + # First pass + blocks1, _, _, aspect_ratio = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, # Thumbnail is handled separately + prior_aspect_ratio=None, + ) 
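# [Editor's note, illustrative comments only, not part of the patch] The
# first pass above mirrors load_image(): with prior_aspect_ratio=None it
# picks the grid closest to the image's own aspect ratio. mapper(image)
# earlier is expected to return a tensor of shape (num_blocks, 3,
# image_size, image_size), so the test recomputes num_blocks from the same
# formulas and compares it against pixel_values.shape[0] further below.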
+ + # Second pass + blocks2, _, _, _ = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, + prior_aspect_ratio=aspect_ratio, + ) + + # Add thumbnail if use_thumbnail is True and total_blocks > 1 + if config.use_thumbnail: + blocks1 +=1 if blocks1 > 1 else 0 + blocks2 +=1 if blocks2 > 1 else 0 + + + # Total blocks is the sum of blocks from both passes minus overlapping + total_blocks = blocks1 + blocks2 -1 + + expected_blocks = total_blocks + + else: + blocks, _, _, _ = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, + prior_aspect_ratio=None, + ) + expected_blocks = blocks + + if config.use_thumbnail and expected_blocks > 1: + expected_blocks += 1 + + return pixel_values, expected_blocks + +@pytest.mark.parametrize("model_name", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("max_dynamic_patch", [None, 2,4,8]) +def test_image_preprocessing(image_assets, model_name, size_factors, max_dynamic_patch): + """Test image preprocessing pipeline with different configurations.""" + # Load the configuration from the model + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + + for asset in image_assets: + image = asset.pil_image + for factor in size_factors: + scaled_image = rescale_image_size(image, factor) + + # Test preprocessing and get expected number of blocks + pixel_values, expected_blocks = run_preprocessing_test( + scaled_image, config, max_dynamic_patch + ) + + # Verify output shapes and properties + actual_blocks = pixel_values.shape[0] + assert actual_blocks == expected_blocks, ( + f"Expected {expected_blocks} blocks, got {actual_blocks}" + ) + + # Check image dimensions + expected_size = ( + 3, # Number of channels (C, H, W) + config.vision_config.image_size, + config.vision_config.image_size, + ) + for img in pixel_values: + assert img.shape == expected_size, ( + f"Expected image size {expected_size}, got {img.shape}" + ) \ No newline at end of file From 4792e8ccb2a31c1df14c36549585fa3b1194b3e6 Mon Sep 17 00:00:00 2001 From: Shanshan Wang Date: Thu, 31 Oct 2024 21:08:20 +0000 Subject: [PATCH 38/43] sync to the latest and fix the tests for h2ovl Signed-off-by: Shanshan Wang --- .../vision_language/test_h2ovl.py | 242 ++---------------- .../vision_language/test_models.py | 17 ++ .../vision_language/vlm_utils/model_utils.py | 57 +++++ 3 files changed, 90 insertions(+), 226 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py index 541f427407b13..ad9aa3104750b 100644 --- a/tests/models/decoder_only/vision_language/test_h2ovl.py +++ b/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -1,29 +1,14 @@ -import types -from typing import List, Optional, Tuple, Type, Union +from typing import Optional, Tuple + import pytest import torch from PIL.Image import Image from transformers import AutoConfig -from vllm.multimodal.utils import rescale_image_size -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from ...utils import check_logprobs_close -from .test_internvl import generate # Import the functions to test -from vllm.model_executor.models.h2ovl import ( - image_to_pixel_values_wrapper, - 
calculate_num_blocks, - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values -) - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "<|prompt|>\nWhat's the content in the center of the image?<|end|><|answer|>", # noqa: E501 - "cherry_blossom": - "<|prompt|>\nWhat is the season?<|end|><|answer|>", # noqa: E501 -}) -HF_MULTIIMAGE_IMAGE_PROMPT = "<|prompt|>Image-1: \nImage-2: \nDescribe the two images in short.<|end|><|answer|>" # noqa: E501 +from vllm.model_executor.models.h2ovl import (calculate_num_blocks, + image_to_pixel_values_wrapper) +from vllm.multimodal.utils import rescale_image_size models = [ "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names @@ -32,199 +17,6 @@ target_dtype = "bfloat16" -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - class H2OVLProcessor: - """A simple processor for H2OVL models.""" - - def __init__(self, hf_runner: HfRunner): - self.num_image_token = hf_runner.model.num_image_token - self.tokenizer = hf_runner.tokenizer - self.dtype = hf_runner.model.dtype - - self.config = AutoConfig.from_pretrained(hf_runner.model_name, - trust_remote_code=True) - self.vision_config = self.config.vision_config - self.use_thumbnail = self.config.use_thumbnail - self.min_num = self.config.min_dynamic_patch - self.max_num = self.config.max_dynamic_patch - self.image_size = self.vision_config.image_size - - def __call__(self, text: str, images: Union[Image, List[Image]], - **kwargs): - images = [images] if isinstance(images, Image) else images - pixel_values = [ - image_to_pixel_values(image, self.image_size, self.min_num, - self.max_num, - self.use_thumbnail, - use_MSAC=self.config.use_msac).to(self.dtype) - for image in images - ] - num_patches_list = [ - pixel_value.shape[0] for pixel_value in pixel_values - ] - pixel_values = torch.cat(pixel_values, dim=0) - for num_patches in num_patches_list: - context_tokens = IMG_CONTEXT * self.num_image_token \ - * num_patches - image_tokens = IMG_START + context_tokens + IMG_END - text = text.replace('', image_tokens, 1) - prompt = self.tokenizer(text, return_tensors="pt") - prompt.update({"pixel_values": pixel_values}) - return prompt - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=8192, - dtype=dtype, - limit_mm_per_prompt={"image": mm_limit}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, 
- images=images) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype) as hf_model: - img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( - "") - hf_model.model.img_context_token_id = img_context_token_id - hf_model.processor = H2OVLProcessor(hf_model) - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.get_output_embeddings() - hf_model.model.generate = types.MethodType(generate, hf_model.model) - eos_token_id = hf_model.tokenizer.eos_token_id - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=hf_images, - eos_token_id=eos_token_id) - for prompts, hf_images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.5, 0.75, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, - size_factors, dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_case = [ - ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors]) - ] - - run_test( - hf_runner, - vllm_runner, - inputs_per_case, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=2, - tensor_parallel_size=1, - ) - - def run_preprocessing_test( image: Image, config, @@ -268,13 +60,12 @@ def run_preprocessing_test( # Add thumbnail if use_thumbnail is True and total_blocks > 1 if config.use_thumbnail: - blocks1 +=1 if blocks1 > 1 else 0 - blocks2 +=1 if blocks2 > 1 else 0 + blocks1 += 1 if blocks1 > 1 else 0 + blocks2 += 1 if blocks2 > 1 else 0 - # Total blocks is the sum of blocks from both passes minus overlapping - total_blocks = blocks1 + blocks2 -1 - + total_blocks = blocks1 + blocks2 - 1 + expected_blocks = total_blocks else: @@ -294,6 +85,7 @@ def run_preprocessing_test( return pixel_values, expected_blocks + @pytest.mark.parametrize("model_name", models) @pytest.mark.parametrize( "size_factors", @@ 
-306,8 +98,9 @@ def run_preprocessing_test( [0.25, 0.5, 1.0], ], ) -@pytest.mark.parametrize("max_dynamic_patch", [None, 2,4,8]) -def test_image_preprocessing(image_assets, model_name, size_factors, max_dynamic_patch): +@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8]) +def test_image_preprocessing(image_assets, model_name, size_factors, + max_dynamic_patch): """Test image preprocessing pipeline with different configurations.""" # Load the configuration from the model config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) @@ -319,14 +112,12 @@ def test_image_preprocessing(image_assets, model_name, size_factors, max_dynamic # Test preprocessing and get expected number of blocks pixel_values, expected_blocks = run_preprocessing_test( - scaled_image, config, max_dynamic_patch - ) + scaled_image, config, max_dynamic_patch) # Verify output shapes and properties actual_blocks = pixel_values.shape[0] assert actual_blocks == expected_blocks, ( - f"Expected {expected_blocks} blocks, got {actual_blocks}" - ) + f"Expected {expected_blocks} blocks, got {actual_blocks}") # Check image dimensions expected_size = ( @@ -336,5 +127,4 @@ def test_image_preprocessing(image_assets, model_name, size_factors, max_dynamic ) for img in pixel_values: assert img.shape == expected_size, ( - f"Expected image size {expected_size}, got {img.shape}" - ) \ No newline at end of file + f"Expected image size {expected_size}, got {img.shape}") diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index e49ea6f98324d..cfd2d61f2b633 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -187,6 +187,23 @@ marks=[large_gpu_mark(min_gb=48)], patch_hf_runner=model_utils.glm_patch_hf_runner, ), + "h2ovl": VLMTestInfo( + models = [ + "h2oai/h2ovl-mississippi-800m", + "h2oai/h2ovl-mississippi-2b", + ], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501 + single_image_prompts=IMAGE_ASSETS.prompts({ + "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "\nWhat is the season?", + }), + multi_image_prompt="Image-1: \nImage-2: \nDescribe the two images in short.", # noqa: E501 + max_model_len=8192, + dtype="bfloat16", + use_tokenizer_eos=True, + patch_hf_runner=model_utils.h2ovl_patch_hf_runner, + ), "intern_vl": VLMTestInfo( models=[ "OpenGVLab/InternVL2-1B", diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index e925934db0e7c..3137af5dad5df 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -259,6 +259,63 @@ def processor(*args, text="", images=None, **kwargs): return hf_model +def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for H2OVL.""" + + class H2OVLProcessor: + """A simple processor for H2OVL models.""" + + def __init__(self, hf_runner: HfRunner): + self.num_image_token = hf_runner.model.num_image_token + self.tokenizer = hf_runner.tokenizer + self.dtype = hf_runner.model.dtype + + self.config = AutoConfig.from_pretrained(hf_runner.model_name, + trust_remote_code=True) + self.vision_config = self.config.vision_config + 
self.use_thumbnail = self.config.use_thumbnail + self.min_num = self.config.min_dynamic_patch + self.max_num = self.config.max_dynamic_patch + self.image_size = self.vision_config.image_size + + def __call__(self, text: str, images: Union[Image, List[Image]], + **kwargs): + from vllm.model_executor.models.h2ovl import ( + IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + images = [images] if isinstance(images, Image) else images + pixel_values = [ + image_to_pixel_values(image, + self.image_size, + self.min_num, + self.max_num, + self.use_thumbnail, + use_MSAC=self.config.use_msac).to( + self.dtype) for image in images + ] + num_patches_list = [ + pixel_value.shape[0] for pixel_value in pixel_values + ] + pixel_values = torch.cat(pixel_values, dim=0) + for num_patches in num_patches_list: + context_tokens = IMG_CONTEXT * self.num_image_token \ + * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") + prompt.update({"pixel_values": pixel_values}) + return prompt + + img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( + "") + hf_model.model.img_context_token_id = img_context_token_id + hf_model.processor = H2OVLProcessor(hf_model) + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language_model.get_output_embeddings() + hf_model.model.generate = types.MethodType(_internvl_generate, + hf_model.model) + return hf_model + + def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner to use for InternVL.""" From 6d6f20dca5b13018d0caea460a29cc5682314179 Mon Sep 17 00:00:00 2001 From: Shanshan Wang Date: Thu, 31 Oct 2024 21:23:53 +0000 Subject: [PATCH 39/43] fixed format Signed-off-by: Shanshan Wang --- .../decoder_only/vision_language/vlm_utils/model_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 3137af5dad5df..b8fe1c07c621f 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -280,8 +280,10 @@ def __init__(self, hf_runner: HfRunner): def __call__(self, text: str, images: Union[Image, List[Image]], **kwargs): - from vllm.model_executor.models.h2ovl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + from vllm.model_executor.models.h2ovl import (IMG_CONTEXT, IMG_END, + IMG_START, + image_to_pixel_values + ) images = [images] if isinstance(images, Image) else images pixel_values = [ image_to_pixel_values(image, From 840764dbc6972c7cfba8c4d4464ba24e6de2479b Mon Sep 17 00:00:00 2001 From: Shanshan Wang Date: Thu, 31 Oct 2024 21:27:50 +0000 Subject: [PATCH 40/43] format checked by format.sh Signed-off-by: Shanshan Wang --- .../decoder_only/vision_language/vlm_utils/model_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index b8fe1c07c621f..3137af5dad5df 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -280,10 +280,8 @@ def __init__(self, hf_runner: HfRunner): def __call__(self, text: str, images: Union[Image, List[Image]], **kwargs): - from 
vllm.model_executor.models.h2ovl import (IMG_CONTEXT, IMG_END, - IMG_START, - image_to_pixel_values - ) + from vllm.model_executor.models.h2ovl import ( + IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) images = [images] if isinstance(images, Image) else images pixel_values = [ image_to_pixel_values(image, From a1846f686372892d6d1b4b0b4502d5c864744970 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 31 Oct 2024 23:36:44 -0700 Subject: [PATCH 41/43] run yapf alone Signed-off-by: Roger Wang --- .../decoder_only/vision_language/vlm_utils/model_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 3137af5dad5df..b8fe1c07c621f 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -280,8 +280,10 @@ def __init__(self, hf_runner: HfRunner): def __call__(self, text: str, images: Union[Image, List[Image]], **kwargs): - from vllm.model_executor.models.h2ovl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + from vllm.model_executor.models.h2ovl import (IMG_CONTEXT, IMG_END, + IMG_START, + image_to_pixel_values + ) images = [images] if isinstance(images, Image) else images pixel_values = [ image_to_pixel_values(image, From bafd757850db9fdc7b497107d37c0eae072219fd Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 1 Nov 2024 00:41:39 -0700 Subject: [PATCH 42/43] workaround yapf Signed-off-by: Roger Wang --- .../decoder_only/vision_language/vlm_utils/model_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index b8fe1c07c621f..caff39c981d36 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -280,10 +280,10 @@ def __init__(self, hf_runner: HfRunner): def __call__(self, text: str, images: Union[Image, List[Image]], **kwargs): - from vllm.model_executor.models.h2ovl import (IMG_CONTEXT, IMG_END, - IMG_START, - image_to_pixel_values - ) + # yapf: disable + from vllm.model_executor.models.h2ovl import ( + IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + # yapf: enable images = [images] if isinstance(images, Image) else images pixel_values = [ image_to_pixel_values(image, From 31ece380097610dc0fbbffb61039acdd4a7d8fd8 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 1 Nov 2024 00:43:43 -0700 Subject: [PATCH 43/43] format Signed-off-by: Roger Wang --- .../models/decoder_only/vision_language/vlm_utils/model_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index caff39c981d36..849857b4232e7 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -283,6 +283,7 @@ def __call__(self, text: str, images: Union[Image, List[Image]], # yapf: disable from vllm.model_executor.models.h2ovl import ( IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + # yapf: enable images = [images] if isinstance(images, Image) else images pixel_values = [
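
For reference, the expected-block arithmetic that run_preprocessing_test checks in tests/models/decoder_only/vision_language/test_h2ovl.py can be restated as a small standalone helper. This is only an illustrative sketch and not part of any patch above: the name expected_num_blocks and its arguments are hypothetical, and the two per-pass counts stand in for the values calculate_num_blocks would return for the two MSAC passes (the second pass reusing the aspect ratio chosen by the first, both computed without thumbnails).

    def expected_num_blocks(blocks_pass1: int,
                            blocks_pass2: int,
                            use_msac: bool,
                            use_thumbnail: bool) -> int:
        """Combine per-pass tile counts the way the h2ovl preprocessing
        test does (hypothetical helper, not part of the patch)."""
        if not use_msac:
            # Single pass: append one thumbnail tile only when the image
            # was split into more than one block.
            total = blocks_pass1
            if use_thumbnail and total > 1:
                total += 1
            return total
        # MSAC: the test appends a thumbnail to each pass independently ...
        if use_thumbnail:
            blocks_pass1 += 1 if blocks_pass1 > 1 else 0
            blocks_pass2 += 1 if blocks_pass2 > 1 else 0
        # ... and then subtracts one overlapping block shared by the two
        # passes, giving the total number of pixel-value tiles expected.
        return blocks_pass1 + blocks_pass2 - 1

    # Example: 7 and 4 raw blocks with MSAC and thumbnails enabled
    # -> (7 + 1) + (4 + 1) - 1 == 12 expected tiles.
    assert expected_num_blocks(7, 4, use_msac=True, use_thumbnail=True) == 12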