From 55650c83a0c386526ed04912a0c60eccca202f3e Mon Sep 17 00:00:00 2001
From: sasha0552 <admin@sasha0552.org>
Date: Thu, 31 Oct 2024 18:46:36 +0000
Subject: [PATCH 01/85] [Bugfix] Fix `illegal memory access` error with chunked
 prefill, prefix caching, block manager v2 and xformers enabled together
 (#9532)

Signed-off-by: sasha0552 <admin@sasha0552.org>
---
 tests/prefix_caching/test_prefix_caching.py | 28 +++++++++++++++++++++
 vllm/attention/backends/utils.py            |  9 ++++---
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py
index 366b030eaa399..fd6564bbfe630 100644
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -5,6 +5,7 @@
 import pytest
 
 from tests.kernels.utils import override_backend_env_variable
+from vllm import SamplingParams, TokensPrompt
 
 from ..models.utils import check_outputs_equal
 
@@ -12,6 +13,14 @@
     "facebook/opt-125m",
 ]
 
+UNSTABLE_PROMPT_SEQUENCE = [
+    ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([3] * 1),
+    ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([5] * 50),
+    ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([6] * 95),
+    ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([7] * 174),
+    ([0] * 588) + ([8] * 1539),
+]
+
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
@@ -57,3 +66,22 @@ def test_mixed_requests(
         name_0="hf",
         name_1="vllm",
     )
+
+
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
+def test_unstable_prompt_sequence(
+    vllm_runner,
+    backend: str,
+    monkeypatch,
+) -> None:
+    override_backend_env_variable(monkeypatch, backend)
+
+    with vllm_runner(
+            "Qwen/Qwen2.5-0.5B-Instruct",
+            enable_chunked_prefill=True,
+            enable_prefix_caching=True,
+            max_model_len=4096,
+    ) as vllm_model:
+        for prompt in UNSTABLE_PROMPT_SEQUENCE:
+            vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
+                                SamplingParams(max_tokens=1))
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index d1a44f3e8bfa6..32fccd0dfb496 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -138,7 +138,6 @@ def _add_seq_group(
             chunked_prefill_enabled: bool):
         is_prompt = inter_data.is_prompt
         block_tables = inter_data.block_tables
-        computed_block_nums = inter_data.computed_block_nums
 
         for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
              curr_sliding_window_block) in zip(
@@ -164,10 +163,14 @@ def _add_seq_group(
             # NOTE: This only works for oooooooxxx style attention.
             block_table = []
             if inter_data.prefix_cache_hit:
-                block_table = computed_block_nums
+                block_table = block_tables[seq_id]
             elif ((chunked_prefill_enabled or not is_prompt)
                   and block_tables is not None):
-                block_table = block_tables[seq_id][-curr_sliding_window_block:]
+                if curr_sliding_window_block == 0:
+                    block_table = block_tables[seq_id]
+                else:
+                    block_table = block_tables[seq_id][
+                        -curr_sliding_window_block:]
             self.block_tables.append(block_table)
 
             # Compute slot mapping.

From 9fb12f7848d427b6c1c29052271030a5e96bd74a Mon Sep 17 00:00:00 2001
From: Mor Zusman <mor.zusmann@gmail.com>
Date: Thu, 31 Oct 2024 22:06:25 +0200
Subject: [PATCH 02/85] [BugFix][Kernel] Fix Illegal memory access in
 causal_conv1d in H100 (#9838)

Signed-off-by: mzusman <mor.zusmann@gmail.com>
---
 csrc/mamba/causal_conv1d/causal_conv1d.cu | 34 +++++++++++++++++++++--
 tests/kernels/test_causal_conv1d.py       |  7 +++--
 tests/kernels/test_mamba_ssm.py           |  6 ++--
 3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu
index 3a464c5f327ad..498d069c05f0d 100644
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -418,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
             typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
         }
         out += kChunkSize;
+
+        int final_state_position =  ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
+        // In case the final state is split between the last "smem_exchange" and
+        // the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
+        // which occurs when `final_state_position` is a negative index,
+        // we load the correct data from smem_exchange in both chunk iterations: the last one and the one before it.
+        if (final_state_position < 0 && seqlen > kWidth){
+            input_t vals_load[kNElts] = {0};
+            if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
+                // chunk = n_chunks - 2, a segment of the final state sits in the last index
+                reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[kNThreads - 1];
+                #pragma unroll
+                for (int w = 0; w < -final_state_position; ++w){
+                    conv_states[w] = vals_load[kNElts + final_state_position + w];
+                }
+            }
+            if ((chunk == n_chunks - 1) && tidx == 0){
+                // chunk = n_chunks - 1, the second segment of the final state sits in the first positions
+                reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[0];
+                for (int w = -final_state_position; w < kWidth - 1; ++w){
+                    conv_states[w] = vals_load[w + final_state_position];
+                }
+                return;
+            }
+        }
     }
     // Final state is stored in the smem_exchange last token slot,
     // in case seqlen < kWidth, we would need to take the final state from the 
@@ -446,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
         }
         else {
             // in case the final state is in between the threads data
-            reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
-            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
             const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
+            if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){
+                // In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in an
+                // illegal memory access error on H100.
+                // Therefore, we access last_thread + 1 only if the final state data sits there.
+                reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
+            }
+            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
             #pragma unroll
             for (int w = 0; w < kWidth - 1; ++w){
                 conv_states[w] = x_vals_load[offset + w ];
diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/test_causal_conv1d.py
index 96bfe06d74ae5..f9b11018288be 100644
--- a/tests/kernels/test_causal_conv1d.py
+++ b/tests/kernels/test_causal_conv1d.py
@@ -151,7 +151,7 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor,
 @pytest.mark.parametrize("has_bias", [True])
 @pytest.mark.parametrize("width", [4])
 @pytest.mark.parametrize(
-    'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
+    'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096])
 @pytest.mark.parametrize('dim', [64])
 @pytest.mark.parametrize('batch', [1])
 def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
@@ -420,7 +420,10 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias,
 
     unpadded_out = out[:, :out_ref_tensor.shape[-1]]
     assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol)
-    assert torch.allclose(final_states, final_states_ref, rtol=rtol, atol=atol)
+    assert torch.allclose(final_states[state_indices],
+                          final_states_ref[state_indices],
+                          rtol=rtol,
+                          atol=atol)
 
     causal_conv1d_opcheck_fn(x.squeeze(0), weight, bias, cumsum.cuda(),
                              padded_state_indices, has_initial_states,
diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py
index bf7ff3b5c59b8..ad05a97685351 100644
--- a/tests/kernels/test_mamba_ssm.py
+++ b/tests/kernels/test_mamba_ssm.py
@@ -555,7 +555,7 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
     device = "cuda"
     rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
     if itype == torch.bfloat16:
-        rtol, atol = 7e-2, 7e-2
+        rtol, atol = 1e-1, 1e-1
         if torch.version.hip:
             atol *= 2
     # set seed
@@ -610,8 +610,8 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
                                          dt_bias=dt_bias,
                                          dt_softplus=True)
 
-    print("Output diff max", (out - out_ref[0]).max())
-    print("Output diff mean", (out - out_ref[0]).mean())
+    print("Output diff max", (out[:batch_size] - out_ref).max())
+    print("Output diff mean", (out[:batch_size] - out_ref).mean())
     print("Output state diff max", (state[state_indices, :] - state_ref).max())
     print("Output state diff mean",
           (state[state_indices, :] - state_ref).mean())

From b63c64d95b01cc955a56bba37d055ad36aa81abd Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Thu, 31 Oct 2024 12:55:38 -1000
Subject: [PATCH 03/85] [ci/build] Configure dependabot to update pip
 dependencies  (#9811)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .github/dependabot.yml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 6fddca0d6e4b9..a21acd9671eeb 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,3 +5,19 @@ updates:
     directory: "/"
     schedule:
       interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    labels: ["dependencies"]
+    open-pull-requests-limit: 5
+    reviewers: ["khluu", "simon-mo"]
+    allow:
+      - dependency-type: "all"
+    groups:
+      patch-update:
+        applies-to: version-updates
+        update-types: ["patch"]
+      minor-update:
+        applies-to: version-updates
+        update-types: ["minor"]

From 031a7995f38d3c73b0790280cc0fa1fe25d33bff Mon Sep 17 00:00:00 2001
From: Joe Runde <Joseph.Runde@ibm.com>
Date: Thu, 31 Oct 2024 19:09:46 -0600
Subject: [PATCH 04/85] [Bugfix][Frontend] Reject guided decoding in multistep
 mode (#9892)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
---
 docs/source/serving/compatibility_matrix.rst  |  2 +-
 .../openai/test_prompt_validation.py          | 20 +++++++++++++++++++
 vllm/engine/llm_engine.py                     |  7 +++++++
 vllm/sampling_params.py                       |  4 ++--
 4 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst
index 20a81f4cad1d1..cab19e4ec5b6c 100644
--- a/docs/source/serving/compatibility_matrix.rst
+++ b/docs/source/serving/compatibility_matrix.rst
@@ -283,7 +283,7 @@ Feature x Feature
      - ✅
      - ✅
      - ✅
-     - `✗ <https://github.com/vllm-project/vllm/issues/8985>`__ 
+     - `✗ <https://github.com/vllm-project/vllm/issues/9893>`__ 
      - ?
      - ✅
      - ✅
diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py
index 58075f7023821..1ae64ef492d5b 100644
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -35,3 +35,23 @@ async def test_out_of_vocab_token_ids():
                                             prompt=[999999],
                                             max_tokens=5,
                                             temperature=0.0)
+
+
+@pytest.mark.asyncio
+async def test_reject_multistep_with_guided_decoding():
+    model_name = "gpt2"
+    server_args = ["--enforce-eager", "--num-scheduler-steps", "8"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+
+        with pytest.raises(openai.BadRequestError,
+                           match=re.compile(
+                               '.*Guided decoding .* multi-step decoding.*')):
+            await client.completions.create(
+                model=model_name,
+                prompt="Hello",
+                max_tokens=5,
+                temperature=0.0,
+                extra_body={"response_format": {
+                    "type": "json_object"
+                }})
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3fd34fadee1ca..edef1f30a9e91 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -829,6 +829,13 @@ def add_request(
             raise ValueError(f"Got priority {priority} but "
                              "Priority scheduling is not enabled.")
 
+        if isinstance(params, SamplingParams) \
+            and (params.guided_decoding or params.logits_processors) \
+            and self.scheduler_config.num_scheduler_steps > 1:
+            raise ValueError(
+                "Guided decoding and logits processors are not supported "
+                "in multi-step decoding")
+
         if arrival_time is None:
             arrival_time = time.time()
 
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 5e191c6e715e0..5c6df5aaf5446 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -485,8 +485,8 @@ def __repr__(self) -> str:
             f"skip_special_tokens={self.skip_special_tokens}, "
             "spaces_between_special_tokens="
             f"{self.spaces_between_special_tokens}, "
-            f"truncate_prompt_tokens={self.truncate_prompt_tokens}), "
-            f"guided_decoding={self.guided_decoding}")
+            f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
+            f"guided_decoding={self.guided_decoding})")
 
 
 class BeamSearchParams(

From 96e0c9cbbd65ad0b8ad20611b90bcc86a8559aae Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 31 Oct 2024 21:56:09 -0700
Subject: [PATCH 05/85] [torch.compile] directly register custom op (#9896)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/compile/piecewise/test_simple.py        | 20 ++++--
 tests/compile/piecewise/test_toy_llama.py     | 20 ++++--
 vllm/attention/backends/flash_attn.py         | 16 +++--
 vllm/attention/backends/flashinfer.py         | 17 +++--
 vllm/distributed/parallel_state.py            | 34 +++++++---
 .../layers/fused_moe/fused_marlin_moe.py      | 25 +++++--
 .../layers/fused_moe/fused_moe.py             | 68 +++++++++++--------
 vllm/utils.py                                 | 45 ++++++++++++
 vllm/v1/attention/backends/flash_attn.py      | 14 ++--
 9 files changed, 192 insertions(+), 67 deletions(-)

diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index a34d33efba1d8..d151d62516b07 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -6,18 +6,22 @@
 
 import torch
 from torch import nn
+from torch.library import Library
 
 from vllm.compilation.compile_context import set_compile_context
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.compilation.levels import CompilationLevel
+from vllm.utils import direct_register_custom_op
 
 os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)
 
 global_counter = 0
 
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT")  # noqa
+
 
-@torch.library.custom_op("silly::attention", mutates_args=["out"])
 def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                     out: torch.Tensor) -> None:
     global global_counter
@@ -27,12 +31,20 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
     out[0] += 1
 
 
-@silly_attention.register_fake
-def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-      out: torch.Tensor) -> None:
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
     return
 
 
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    target_lib=silly_lib,
+)
+
+
 @support_torch_compile
 class SillyModel(nn.Module):
 
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index db6a983d70feb..e3e5a7d0fc5a5 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -8,6 +8,7 @@
 
 import torch
 from torch import nn
+from torch.library import Library
 
 from vllm.compilation.compile_context import set_compile_context
 from vllm.compilation.config import CompilationConfig
@@ -15,9 +16,12 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.compilation.levels import CompilationLevel
 from vllm.plugins import set_compilation_config
+from vllm.utils import direct_register_custom_op
+
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT")  # noqa
 
 
-@torch.library.custom_op("silly::attention", mutates_args=["out"])
 def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                     out: torch.Tensor) -> None:
     out.copy_(q)
@@ -25,12 +29,20 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
     out += v
 
 
-@silly_attention.register_fake
-def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-      out: torch.Tensor) -> None:
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
     return
 
 
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    target_lib=silly_lib,
+)
+
+
 @dataclass
 class LlamaConfig:
     hidden_size: int = 128
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index ffa05e80623ac..c294fcf7f08fe 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -14,7 +14,8 @@
                                            compute_slot_mapping_start_idx,
                                            is_block_tables_empty)
 from vllm.forward_context import get_forward_context
-from vllm.utils import async_tensor_h2d, make_tensor_with_pad
+from vllm.utils import (async_tensor_h2d, direct_register_custom_op,
+                        make_tensor_with_pad)
 
 if TYPE_CHECKING:
     from vllm.worker.model_runner import (ModelInputForGPUBuilder,
@@ -595,8 +596,6 @@ def forward(
         return output
 
 
-@torch.library.custom_op("vllm::unified_flash_attention",
-                         mutates_args=["kv_cache"])
 def unified_flash_attention(
     query: torch.Tensor,
     key: torch.Tensor,
@@ -755,8 +754,7 @@ def unified_flash_attention(
     return output.view(num_tokens, hidden_size)
 
 
-@unified_flash_attention.register_fake
-def _(
+def unified_flash_attention_fake(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
@@ -773,3 +771,11 @@ def _(
     logits_soft_cap: Optional[float] = None,
 ) -> torch.Tensor:
     return torch.empty_like(query)
+
+
+direct_register_custom_op(
+    op_name="unified_flash_attention",
+    op_func=unified_flash_attention,
+    mutates_args=["kv_cache"],
+    fake_impl=unified_flash_attention_fake,
+)
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 5ea101ae0432f..234c87d5c4edb 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -28,8 +28,8 @@
                                            is_block_tables_empty)
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.forward_context import get_forward_context
-from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
-                        make_tensor_with_pad)
+from vllm.utils import (async_tensor_h2d, direct_register_custom_op,
+                        get_kv_cache_torch_dtype, make_tensor_with_pad)
 
 if TYPE_CHECKING:
     from vllm.worker.model_runner import (ModelInputForGPUBuilder,
@@ -785,8 +785,6 @@ def forward(
         )
 
 
-@torch.library.custom_op("vllm::unified_flash_infer",
-                         mutates_args=["kv_cache"])
 def unified_flash_infer(
     query: torch.Tensor,
     key: torch.Tensor,
@@ -906,8 +904,7 @@ def unified_flash_infer(
     return output.view(num_tokens, hidden_size)
 
 
-@unified_flash_infer.register_fake
-def _(
+def unified_flash_infer_fake(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
@@ -924,3 +921,11 @@ def _(
     logits_soft_cap: Optional[float] = None,
 ) -> torch.Tensor:
     return torch.empty_like(query).contiguous()
+
+
+direct_register_custom_op(
+    op_name="unified_flash_infer",
+    op_func=unified_flash_infer,
+    mutates_args=["kv_cache"],
+    fake_impl=unified_flash_infer_fake,
+)
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index b04bbc478534c..94ba41a016f6d 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -37,7 +37,7 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import supports_custom_op
+from vllm.utils import direct_register_custom_op, supports_custom_op
 
 
 @dataclass
@@ -99,8 +99,6 @@ def _register_group(group: "GroupCoordinator") -> None:
 
 if supports_custom_op():
 
-    @torch.library.custom_op("vllm::inplace_all_reduce",
-                             mutates_args=["tensor"])
     def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
         assert group_name in _groups, f"Group {group_name} is not found."
         group = _groups[group_name]()
@@ -108,11 +106,16 @@ def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
             raise ValueError(f"Group {group_name} is destroyed.")
         group._all_reduce_in_place(tensor)
 
-    @inplace_all_reduce.register_fake
-    def _(tensor: torch.Tensor, group_name: str) -> None:
+    def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None:
         return
 
-    @torch.library.custom_op("vllm::outplace_all_reduce", mutates_args=[])
+    direct_register_custom_op(
+        op_name="inplace_all_reduce",
+        op_func=inplace_all_reduce,
+        mutates_args=["tensor"],
+        fake_impl=inplace_all_reduce_fake,
+    )
+
     def outplace_all_reduce(tensor: torch.Tensor,
                             group_name: str) -> torch.Tensor:
         assert group_name in _groups, f"Group {group_name} is not found."
@@ -121,10 +124,17 @@ def outplace_all_reduce(tensor: torch.Tensor,
             raise ValueError(f"Group {group_name} is destroyed.")
         return group._all_reduce_out_place(tensor)
 
-    @outplace_all_reduce.register_fake
-    def _(tensor: torch.Tensor, group_name: str) -> torch.Tensor:
+    def outplace_all_reduce_fake(tensor: torch.Tensor,
+                                 group_name: str) -> torch.Tensor:
         return torch.empty_like(tensor)
 
+    direct_register_custom_op(
+        op_name="outplace_all_reduce",
+        op_func=outplace_all_reduce,
+        mutates_args=[],
+        fake_impl=outplace_all_reduce_fake,
+    )
+
 
 class GroupCoordinator:
     """
@@ -338,6 +348,11 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
         if self.world_size == 1:
             return input_
 
+        if input_.is_cpu:
+            import intel_extension_for_pytorch as ipex
+            ipex.distributed.all_reduce(input_, group=self.device_group)
+            return input_
+
         if not supports_custom_op():
             self._all_reduce_in_place(input_)
             return input_
@@ -369,9 +384,6 @@ def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
         pynccl_comm = self.pynccl_comm
         if (pynccl_comm is not None and not pynccl_comm.disabled):
             pynccl_comm.all_reduce(input_)
-        elif input_.is_cpu:
-            import intel_extension_for_pytorch as ipex
-            ipex.distributed.all_reduce(input_, group=self.device_group)
         else:
             torch.distributed.all_reduce(input_, group=self.device_group)
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 93019d0d0abb6..4741d69de11ac 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -8,6 +8,7 @@
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_topk, moe_align_block_size, try_get_optimal_moe_config)
 from vllm.scalar_type import scalar_types
+from vllm.utils import direct_register_custom_op
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -18,7 +19,6 @@ def get_scalar_type(num_bits: int, has_zp: bool):
         return scalar_types.uint4b8 if num_bits == 4 else scalar_types.uint8b128
 
 
-@torch.library.custom_op("vllm::single_marlin_moe", mutates_args=[])
 def single_marlin_moe(
     hidden_states: torch.Tensor,
     w: torch.Tensor,
@@ -119,8 +119,7 @@ def single_marlin_moe(
     return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
 
 
-@single_marlin_moe.register_fake
-def _(
+def single_marlin_moe_fake(
     hidden_states: torch.Tensor,
     w: torch.Tensor,
     scales: torch.Tensor,
@@ -136,7 +135,14 @@ def _(
     return torch.empty_like(hidden_states)
 
 
-@torch.library.custom_op("vllm::fused_marlin_moe", mutates_args=[])
+direct_register_custom_op(
+    op_name="single_marlin_moe",
+    op_func=single_marlin_moe,
+    mutates_args=[],
+    fake_impl=single_marlin_moe_fake,
+)
+
+
 def fused_marlin_moe(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
@@ -324,8 +330,7 @@ def fused_marlin_moe(
                      dim=1)
 
 
-@fused_marlin_moe.register_fake
-def _(
+def fused_marlin_moe_fake(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
     w2: torch.Tensor,
@@ -344,3 +349,11 @@ def _(
     is_k_full: bool = True,
 ) -> torch.Tensor:
     return torch.empty_like(hidden_states)
+
+
+direct_register_custom_op(
+    op_name="fused_marlin_moe",
+    op_func=fused_marlin_moe,
+    mutates_args=[],
+    fake_impl=fused_marlin_moe_fake,
+)
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 1cf5c2253ca0b..340da32263c1c 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -12,6 +12,7 @@
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.utils import direct_register_custom_op
 
 logger = init_logger(__name__)
 
@@ -466,8 +467,6 @@ def get_config_dtype_str(dtype: torch.dtype,
     return None
 
 
-@torch.library.custom_op("vllm::inplace_fused_experts",
-                         mutates_args=["hidden_states"])
 def inplace_fused_experts(hidden_states: torch.Tensor,
                           w1: torch.Tensor,
                           w2: torch.Tensor,
@@ -484,22 +483,29 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
                        a1_scale, a2_scale)
 
 
-@inplace_fused_experts.register_fake
-def _(hidden_states: torch.Tensor,
-      w1: torch.Tensor,
-      w2: torch.Tensor,
-      topk_weights: torch.Tensor,
-      topk_ids: torch.Tensor,
-      use_fp8_w8a8: bool = False,
-      use_int8_w8a16: bool = False,
-      w1_scale: Optional[torch.Tensor] = None,
-      w2_scale: Optional[torch.Tensor] = None,
-      a1_scale: Optional[torch.Tensor] = None,
-      a2_scale: Optional[torch.Tensor] = None) -> None:
+def inplace_fused_experts_fake(
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        use_fp8_w8a8: bool = False,
+        use_int8_w8a16: bool = False,
+        w1_scale: Optional[torch.Tensor] = None,
+        w2_scale: Optional[torch.Tensor] = None,
+        a1_scale: Optional[torch.Tensor] = None,
+        a2_scale: Optional[torch.Tensor] = None) -> None:
     pass
 
 
-@torch.library.custom_op("vllm::outplace_fused_experts", mutates_args=[])
+direct_register_custom_op(
+    op_name="inplace_fused_experts",
+    op_func=inplace_fused_experts,
+    mutates_args=["hidden_states"],
+    fake_impl=inplace_fused_experts_fake,
+)
+
+
 def outplace_fused_experts(
         hidden_states: torch.Tensor,
         w1: torch.Tensor,
@@ -517,21 +523,29 @@ def outplace_fused_experts(
                               w2_scale, a1_scale, a2_scale)
 
 
-@outplace_fused_experts.register_fake
-def _(hidden_states: torch.Tensor,
-      w1: torch.Tensor,
-      w2: torch.Tensor,
-      topk_weights: torch.Tensor,
-      topk_ids: torch.Tensor,
-      use_fp8_w8a8: bool = False,
-      use_int8_w8a16: bool = False,
-      w1_scale: Optional[torch.Tensor] = None,
-      w2_scale: Optional[torch.Tensor] = None,
-      a1_scale: Optional[torch.Tensor] = None,
-      a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
+def outplace_fused_experts_fake(
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        use_fp8_w8a8: bool = False,
+        use_int8_w8a16: bool = False,
+        w1_scale: Optional[torch.Tensor] = None,
+        w2_scale: Optional[torch.Tensor] = None,
+        a1_scale: Optional[torch.Tensor] = None,
+        a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
     return torch.empty_like(hidden_states)
 
 
+direct_register_custom_op(
+    op_name="outplace_fused_experts",
+    op_func=outplace_fused_experts,
+    mutates_args=[],
+    fake_impl=outplace_fused_experts_fake,
+)
+
+
 def fused_experts(hidden_states: torch.Tensor,
                   w1: torch.Tensor,
                   w2: torch.Tensor,
diff --git a/vllm/utils.py b/vllm/utils.py
index 03cdbe6a0dc7b..5488719cc99b0 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -32,6 +32,7 @@
 import torch.types
 import yaml
 from packaging.version import Version
+from torch.library import Library
 from typing_extensions import ParamSpec, TypeIs, assert_never
 
 import vllm.envs as envs
@@ -1512,3 +1513,47 @@ def weak_ref_tensors(
     if isinstance(tensors, tuple):
         return tuple(weak_ref_tensor(t) for t in tensors)
     raise ValueError("Invalid type for tensors")
+
+
+def is_in_doc_build() -> bool:
+    try:
+        from sphinx.ext.autodoc.mock import _MockModule
+        return isinstance(torch, _MockModule)
+    except ModuleNotFoundError:
+        return False
+
+
+# create a library to hold the custom op
+vllm_lib = Library("vllm", "FRAGMENT")  # noqa
+
+
+def direct_register_custom_op(
+    op_name: str,
+    op_func: Callable,
+    mutates_args: List[str],
+    fake_impl: Optional[Callable] = None,
+    target_lib: Optional[Library] = None,
+):
+    """
+    `torch.library.custom_op` can have significant overhead because it
+    needs to consider complicated dispatching logic. This function
+    directly registers a custom op and dispatches it to the CUDA backend.
+    See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
+    for more details.
+
+    By default, the custom op is registered to the vLLM library. If you
+    want to register it to a different library, you can pass the library
+    object to the `target_lib` argument.
+
+    IMPORTANT: the lifetime of the operator is tied to the lifetime of the
+    library object. If you want to bind the operator to a different library,
+    make sure the library object is alive when the operator is used.
+    """
+    if is_in_doc_build():
+        return
+    schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
+    my_lib = target_lib or vllm_lib
+    my_lib.define(op_name + schema_str)
+    my_lib.impl(op_name, op_func, "CUDA")
+    if fake_impl is not None:
+        my_lib._register_fake(op_name, fake_impl)
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index ec07464e6a12a..b2af89ebf854a 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -7,6 +7,7 @@
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
 from vllm.forward_context import get_forward_context
+from vllm.utils import direct_register_custom_op
 from vllm.vllm_flash_attn import flash_attn_varlen_func
 
 
@@ -152,8 +153,6 @@ def forward(
         return output
 
 
-@torch.library.custom_op("vllm::unified_flash_attention",
-                         mutates_args=["kv_cache"])
 def unified_flash_attention(
     query: torch.Tensor,
     key: torch.Tensor,
@@ -217,8 +216,7 @@ def unified_flash_attention(
     return output.view(num_tokens, hidden_size)
 
 
-@unified_flash_attention.register_fake
-def _(
+def unified_flash_attention_fake(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
@@ -235,3 +233,11 @@ def _(
     logits_soft_cap: Optional[float] = None,
 ) -> torch.Tensor:
     return torch.empty_like(query)
+
+
+direct_register_custom_op(
+    op_name="unified_flash_attention",
+    op_func=unified_flash_attention,
+    mutates_args=["kv_cache"],
+    fake_impl=unified_flash_attention_fake,
+)

From 37a4947dcd68c602d0911920e2c1a9168dea1ecb Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Fri, 1 Nov 2024 01:12:44 -0400
Subject: [PATCH 06/85] [Bugfix] Fix layer skip logic with bitsandbytes (#9887)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 vllm/model_executor/layers/quantization/bitsandbytes.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 7a039a78f09b8..718967a065192 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -119,7 +119,12 @@ def get_scaled_act_names(self) -> List[str]:
 
 
 def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]):
-    return any(module_name in prefix for module_name in llm_int8_skip_modules)
+    # Skip when an entry equals one dot-separated component of the prefix
+    # (e.g. "lm_head") or, for dotted entries such as "model.lm_head",
+    # a contiguous run of components. Substrings never match.
+    dotted_prefix = f".{prefix}."
+    return any(f".{module_name}." in dotted_prefix
+               for module_name in llm_int8_skip_modules)
 
 
 class BitsAndBytesLinearMethod(LinearMethodBase):

From 566cd277979bc1a46b7e99657112416af9874a58 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 31 Oct 2024 22:20:17 -0700
Subject: [PATCH 07/85] [torch.compile] rework test plans (#9866)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/compile/test_basic_correctness.py | 113 +++++++++++++++++----
 tests/utils.py                          | 124 +++++++++++++++++++++++-
 vllm/model_executor/models/llava.py     |  10 +-
 vllm/model_executor/models/phi3v.py     |  10 +-
 4 files changed, 226 insertions(+), 31 deletions(-)

diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index 2f92ff73845f5..833589ba5dc9f 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -1,3 +1,4 @@
+import dataclasses
 from typing import Dict, List, Optional
 
 import pytest
@@ -8,33 +9,109 @@
 from ..utils import compare_all_settings
 
 
+@dataclasses.dataclass
+class TestSetting:
+    model: str
+    model_args: List[str]
+    pp_size: int
+    tp_size: int
+    attn_backend: str
+    method: str
+    fullgraph: bool
+
+
+# representative settings for testing
+test_settings = [
+    # basic llama model
+    TestSetting(
+        model="meta-llama/Llama-3.2-1B",
+        model_args=[],
+        pp_size=2,
+        tp_size=2,
+        attn_backend="FLASHINFER",
+        method="generate",
+        fullgraph=True,
+    ),
+    # llama model with quantization
+    TestSetting(
+        model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+        model_args=["--quantization", "gptq"],
+        pp_size=1,
+        tp_size=1,
+        attn_backend="FLASH_ATTN",
+        method="generate",
+        fullgraph=True,
+    ),
+    # MoE model
+    TestSetting(
+        model="ibm/PowerMoE-3b",
+        model_args=[],
+        pp_size=1,
+        tp_size=2,
+        attn_backend="FLASH_ATTN",
+        method="generate",
+        fullgraph=True,
+    ),
+    # embedding model
+    TestSetting(
+        model="BAAI/bge-multilingual-gemma2",
+        model_args=["--task", "embedding"],
+        pp_size=1,
+        tp_size=1,
+        attn_backend="FLASHINFER",
+        method="encode",
+        fullgraph=True,
+    ),
+    # vision language model
+    TestSetting(
+        model="microsoft/Phi-3.5-vision-instruct",
+        model_args=["--trust-remote-code", "--max-model-len", "2048"],
+        pp_size=2,
+        tp_size=1,
+        attn_backend="FLASH_ATTN",
+        method="generate_with_image",
+        fullgraph=False,
+    ),
+]
+
+
 # we cannot afford testing the full Catesian product
 # of all models and all levels
-@pytest.mark.parametrize(
-    "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
-    [
-        ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASHINFER", "generate", True),
-        ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
-         ["--quantization", "compressed-tensors"
-          ], 1, 1, "FLASH_ATTN", "generate", True),
-        ("ibm/PowerMoE-3b", [], 1, 2, "FLASH_ATTN", "generate", True),
-        # TODO: add multi-modality test for llava
-        ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False)
-    ])
-def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend,
-                             method, fullgraph):
+@pytest.mark.parametrize("test_setting", test_settings)
+def test_compile_correctness(test_setting: TestSetting):
     # this test is run under multiple suits, with different GPUs.
     # make sure we only run the test with correct CUDA devices.
     # don't use "<", as it will duplicate the tests.
+    model = test_setting.model
+    model_args = test_setting.model_args
+    pp_size = test_setting.pp_size
+    tp_size = test_setting.tp_size
+    attn_backend = test_setting.attn_backend
+    method = test_setting.method
+    fullgraph = test_setting.fullgraph
     if cuda_device_count_stateless() != pp_size * tp_size:
         pytest.skip("Not correct CUDA devices for the test.")
     import os
     os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
-    all_args = [["--enforce-eager"] + model_args + ["-pp", str(pp_size)] +
-                ["-tp", str(tp_size)]] * 3
-    # don't test VLLM_TORCH_COMPILE_LEVEL == 3 case
-    # inductor will change the output, so we cannot compare them.
+    final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
+                ["-tp", str(tp_size)]
+
     all_envs: List[Optional[Dict[str, str]]] = []
+
+    for level in [
+            CompilationLevel.NO_COMPILATION,
+            CompilationLevel.PIECEWISE,
+    ]:
+        all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)})
+
+    # inductor will change the output, so we only compare if the output
+    # is close, not exactly the same.
+    compare_all_settings(
+        model, [final_args] * 2,
+        all_envs,
+        method=method if method != "generate" else "generate_close")
+    all_envs.clear()
+
     for level in [
             CompilationLevel.NO_COMPILATION,
             CompilationLevel.DYNAMO_AS_IS,
@@ -46,4 +123,4 @@ def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend,
             all_envs[-1][
                 "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore
 
-    compare_all_settings(model, all_args, all_envs, method=method)
+    compare_all_settings(model, [final_args] * 3, all_envs, method=method)
diff --git a/tests/utils.py b/tests/utils.py
index e8aad9cb3268f..16e21f68c7c96 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import functools
 import os
 import signal
@@ -8,13 +9,14 @@
 import warnings
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Literal, Optional, Type, Union
+from typing import Any, Callable, Dict, List, Optional, Type, Union
 
 import openai
 import pytest
 import requests
+import torch
 from openai.types.completion import Completion
-from typing_extensions import ParamSpec, assert_never
+from typing_extensions import ParamSpec
 
 import vllm.envs as envs
 from tests.models.utils import TextTextLogprobs
@@ -272,6 +274,31 @@ def _test_completion(
     return results
 
 
+def _test_completion_close(
+    client: openai.OpenAI,
+    model: str,
+    prompt: str,
+):
+    """Request one token at temperature 0 and report its top-5 logprobs.
+
+    The logprobs are rounded to two decimals; the return value is a
+    one-element list holding a ``{"test": ..., "logprobs": ...}`` dict.
+    """
+    completion = client.completions.create(model=model,
+                                           prompt=prompt,
+                                           max_tokens=1,
+                                           logprobs=5,
+                                           temperature=0.0)
+
+    top_logprobs = completion.choices[0].logprobs.top_logprobs[0]
+    rounded = {token: round(lp, 2) for token, lp in top_logprobs.items()}
+
+    return [{
+        "test": "completion_close",
+        "logprobs": rounded,
+    }]
+
+
 def _test_embeddings(
     client: openai.OpenAI,
     model: str,
@@ -295,13 +322,81 @@ def _test_embeddings(
     return results
 
 
+def _test_image_text(
+    client: openai.OpenAI,
+    model_name: str,
+    image_url: str,
+):
+    """Query the chat endpoint with a text-only and an image+text prompt.
+
+    Each request decodes one token at temperature 0 and asks for the
+    top-5 logprobs of that token. The logprobs of both requests are
+    rounded to two decimals before being stored.
+
+    Args:
+        client: OpenAI client used to send the chat completion requests.
+        model_name: model name passed with each request.
+        image_url: URL placed in the ``image_url`` part of the second prompt.
+
+    Returns:
+        A list with one ``{"test": ..., "logprobs": ...}`` dict per request,
+        first ``"pure_text"`` and then ``"text_image"``.
+    """
+    text_only = [
+        {
+            "type": "text",
+            "text": "How do you feel today?"
+        },
+    ]
+    image_and_text = [
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        },
+        {
+            "type": "text",
+            "text": "What's in this image?"
+        },
+    ]
+
+    results = []
+    for test_name, content in (("pure_text", text_only),
+                               ("text_image", image_and_text)):
+        chat_completion = client.chat.completions.create(
+            model=model_name,
+            messages=[{
+                "role": "user",
+                "content": content,
+            }],
+            temperature=0.0,
+            max_tokens=1,
+            logprobs=True,
+            top_logprobs=5)
+        top_logprobs = (
+            chat_completion.choices[0].logprobs.content[0].top_logprobs)
+
+        # Round in both cases (the image request used to be left at full
+        # precision) so the two entries are compared with equal tolerance.
+        for x in top_logprobs:
+            x.logprob = round(x.logprob, 2)
+
+        results.append({
+            "test": test_name,
+            "logprobs": top_logprobs,
+        })
+
+    return results
+
+
 def compare_two_settings(model: str,
                          arg1: List[str],
                          arg2: List[str],
                          env1: Optional[Dict[str, str]] = None,
                          env2: Optional[Dict[str, str]] = None,
                          *,
-                         method: Literal["generate", "encode"] = "generate",
+                         method: str = "generate",
                          max_wait_seconds: Optional[float] = None) -> None:
     """
     Launch API server with two different sets of arguments/environments
@@ -328,7 +423,7 @@ def compare_all_settings(model: str,
                          all_args: List[List[str]],
                          all_envs: List[Optional[Dict[str, str]]],
                          *,
-                         method: Literal["generate", "encode"] = "generate",
+                         method: str = "generate",
                          max_wait_seconds: Optional[float] = None) -> None:
     """
     Launch API server with several different sets of arguments/environments
@@ -397,10 +492,17 @@ def compare_all_settings(model: str,
 
             if method == "generate":
                 results += _test_completion(client, model, prompt, token_ids)
+            elif method == "generate_close":
+                results += _test_completion_close(client, model, prompt)
+            elif method == "generate_with_image":
+                results += _test_image_text(
+                    client, model,
+                    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
+                )
             elif method == "encode":
                 results += _test_embeddings(client, model, prompt)
             else:
-                assert_never(method)
+                raise ValueError(f"Unknown method: {method}")
 
             if i > 0:
                 # if any setting fails, raise an error early
@@ -410,6 +512,18 @@ def compare_all_settings(model: str,
                 compare_envs = all_envs[i]
                 for ref_result, compare_result in zip(ref_results,
                                                       compare_results):
+                    ref_result = copy.deepcopy(ref_result)
+                    compare_result = copy.deepcopy(compare_result)
+                    if "embedding" in ref_result and method == "encode":
+                        ref_embedding = torch.tensor(ref_result["embedding"])
+                        compare_embedding = torch.tensor(
+                            compare_result["embedding"])
+                        mse = ((ref_embedding - compare_embedding)**2).mean()
+                        assert mse < 1e-6, (
+                            f"Embedding for {model=} are not the same.\n"
+                            f"mse={mse}\n")
+                        del ref_result["embedding"]
+                        del compare_result["embedding"]
                     assert ref_result == compare_result, (
                         f"Results for {model=} are not the same.\n"
                         f"{ref_args=} {ref_envs=}\n"
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index eda99c029881f..27055e7ced865 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -493,13 +493,9 @@ def forward(
             :class:`LlavaImageInputs`
         """
         if intermediate_tensors is not None:
-            input_ids = None
             inputs_embeds = None
         else:
-            # always pass the input via `inputs_embeds`
-            # to make sure the computation graph is consistent
             image_input = self._parse_and_validate_image_input(**kwargs)
-
             if image_input is not None:
                 vision_embeddings = self._process_image_input(image_input)
                 inputs_embeds = self.language_model.model.get_input_embeddings(
@@ -511,7 +507,11 @@ def forward(
             else:
                 inputs_embeds = self.language_model.model.get_input_embeddings(
                     input_ids)
-            input_ids = None
+
+        # always pass the input via `inputs_embeds`
+        # to make sure the computation graph is consistent
+        # for `torch.compile` integration
+        input_ids = None
 
         hidden_states = self.language_model.model(input_ids,
                                                   positions,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 0fc4556831fd7..4928e447d5b9e 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -679,7 +679,6 @@ def forward(self,
                 intermediate_tensors: Optional[IntermediateTensors] = None,
                 **kwargs: object):
         if intermediate_tensors is not None:
-            input_ids = None
             inputs_embeds = None
         else:
             image_input = self._parse_and_validate_image_input(**kwargs)
@@ -690,9 +689,14 @@ def forward(self,
                 inputs_embeds = merge_multimodal_embeddings(
                     input_ids, inputs_embeds, vision_embeddings,
                     self.image_token_id)
-                input_ids = None
             else:
-                inputs_embeds = None
+                inputs_embeds = self.language_model.model.embed_tokens(
+                    input_ids)
+
+        # always pass the input via `inputs_embeds`
+        # to make sure the computation graph is consistent
+        # for `torch.compile` integration
+        input_ids = None
 
         hidden_states = self.language_model.model(input_ids,
                                                   positions,

From 93a76dd21dcec8977f1ffd0e21faa88fb515b9e4 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Fri, 1 Nov 2024 01:31:56 -0400
Subject: [PATCH 08/85] [Model] Support bitsandbytes for MiniCPMV (#9891)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 vllm/model_executor/models/minicpmv.py | 43 ++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index a270282d87bc8..4917c33136069 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -810,6 +810,28 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
         # resampler
         "kv_proj",
     ]
+
+    # BitsAndBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
     embedding_modules = {}
     embedding_padding_modules = []
 
@@ -931,6 +953,27 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
         "kv_proj",
     ]
 
+    # BitsAndBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
     embedding_modules = {}
     embedding_padding_modules = []
 

From 2b5bf20988edaab21621b78a9eb589edc93f2763 Mon Sep 17 00:00:00 2001
From: Yongzao <532741407@qq.com>
Date: Fri, 1 Nov 2024 15:25:47 +0800
Subject: [PATCH 09/85] [torch.compile] Adding torch compile annotations to
 some models (#9876)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/models/supported_models.rst     | 2 +-
 tests/distributed/test_pipeline_parallel.py | 2 +-
 vllm/model_executor/models/falcon.py        | 2 ++
 vllm/model_executor/models/phi.py           | 2 ++
 vllm/model_executor/models/qwen.py          | 2 ++
 vllm/model_executor/models/qwen2.py         | 2 ++
 vllm/model_executor/models/qwen2_moe.py     | 2 ++
 7 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 3279e7a108232..e493cebf1e9f4 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -281,7 +281,7 @@ Text Generation
     - ✅︎
   * - :code:`Qwen2ForCausalLM`
     - Qwen2
-    - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc.
+    - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
     - ✅︎
     - ✅︎
   * - :code:`Qwen2MoeForCausalLM`
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index ed6360f9d6148..1489a60891761 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -166,7 +166,7 @@ def iter_params(self, model_name: str):
     "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
     "adept/persimmon-8b-chat": PPTestSettings.fast(),
     "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
-    "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
+    "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(),
     "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
     "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
     "bigcode/starcoder2-3b": PPTestSettings.fast(),
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 467a33505ee12..36c85e37783ab 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -27,6 +27,7 @@
 from transformers import FalconConfig as HF_FalconConfig
 
 from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig
 from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
@@ -329,6 +330,7 @@ def forward(
         return output
 
 
+@support_torch_compile
 class FalconModel(nn.Module):
 
     def __init__(
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index ec20cb249ba9b..497eae4e8905b 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -42,6 +42,7 @@
 from transformers import PhiConfig
 
 from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, LoRAConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
@@ -193,6 +194,7 @@ def forward(
         return hidden_states
 
 
+@support_torch_compile
 class PhiModel(nn.Module):
 
     def __init__(self,
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 998016ea28c26..61665768eacf5 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -20,6 +20,7 @@
 from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
@@ -549,6 +550,7 @@ def forward(
         return hidden_states, residual
 
 
+@support_torch_compile
 class QWenModel(nn.Module):
 
     def __init__(
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index db1029345a8ac..db7556b3b5f4b 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -29,6 +29,7 @@
 from transformers import Qwen2Config
 
 from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, LoRAConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -237,6 +238,7 @@ def forward(
         return hidden_states, residual
 
 
+@support_torch_compile
 class Qwen2Model(nn.Module):
 
     def __init__(
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index d4475b7ca27af..dac85e35d369d 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -30,6 +30,7 @@
 from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig
 from vllm.distributed import (get_pp_group,
                               get_tensor_model_parallel_world_size,
@@ -312,6 +313,7 @@ def forward(
         return hidden_states, residual
 
 
+@support_torch_compile
 class Qwen2MoeModel(nn.Module):
 
     def __init__(

From d3aa2a8b2f93f50ed40fe7d8617701a2294a13e4 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 1 Nov 2024 15:34:49 +0800
Subject: [PATCH 10/85] [Doc] Update multi-input support (#9906)

---
 docs/source/models/supported_models.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index e493cebf1e9f4..80714a90df5c2 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -466,7 +466,7 @@ Text Generation
     - ✅︎
   * - :code:`LlavaOnevisionForConditionalGeneration`
     - LLaVA-Onevision
-    - T + I\ :sup:`+` + V
+    - T + I\ :sup:`+` + V\ :sup:`+`
     - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
     -
     - ✅︎
@@ -478,7 +478,7 @@ Text Generation
     - ✅︎
   * - :code:`MllamaForConditionalGeneration`
     - Llama 3.2
-    - T + I
+    - T + I\ :sup:`+`
     - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
     -
     -

From 06386a64dd706cf3fdab82510124ca2c2f9eee9d Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 1 Nov 2024 16:13:35 +0800
Subject: [PATCH 11/85] [Frontend] Chat-based Embeddings API (#9759)

---
 docs/requirements-docs.txt                    |   2 +
 docs/source/conf.py                           |   2 +-
 docs/source/dev/pooling_params.rst            |   5 +
 docs/source/getting_started/quickstart.rst    |   8 +-
 docs/source/index.rst                         |   1 +
 docs/source/models/vlm.rst                    |  54 ++++-
 .../serving/openai_compatible_server.md       |  55 ++++-
 tests/entrypoints/openai/test_basic.py        |  13 +-
 tests/entrypoints/openai/test_embedding.py    | 137 +++++++----
 tests/entrypoints/openai/test_metrics.py      |  14 +-
 tests/entrypoints/openai/test_tokenization.py |  32 +--
 .../openai/test_vision_embedding.py           |  94 ++++++++
 vllm/entrypoints/openai/api_server.py         |  96 +++++---
 vllm/entrypoints/openai/protocol.py           |  87 ++++++-
 vllm/entrypoints/openai/run_batch.py          |  34 ++-
 vllm/entrypoints/openai/serving_chat.py       | 222 +++++++-----------
 vllm/entrypoints/openai/serving_completion.py |  75 +++---
 vllm/entrypoints/openai/serving_embedding.py  |  87 ++++---
 vllm/entrypoints/openai/serving_engine.py     | 159 ++++++++++++-
 .../openai/serving_tokenization.py            |  87 +++----
 vllm/pooling_params.py                        |   4 +-
 21 files changed, 853 insertions(+), 415 deletions(-)
 create mode 100644 docs/source/dev/pooling_params.rst
 create mode 100644 tests/entrypoints/openai/test_vision_embedding.py

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index d58f226136918..e3e35844405ac 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -13,5 +13,7 @@ torch
 py-cpuinfo
 transformers
 mistral_common >= 1.3.4
+aiohttp
+starlette
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8435129e752e1..c7b638473a931 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -96,7 +96,6 @@ def setup(app):
 
 # Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
-    "aiohttp",
     "compressed_tensors",
     "cpuinfo",
     "cv2",
@@ -143,6 +142,7 @@ def add_line(self, line: str, source: str, *lineno: int) -> None:
     "python": ("https://docs.python.org/3", None),
     "typing_extensions":
     ("https://typing-extensions.readthedocs.io/en/latest", None),
+    "aiohttp": ("https://docs.aiohttp.org/en/stable", None),
     "pillow": ("https://pillow.readthedocs.io/en/stable", None),
     "numpy": ("https://numpy.org/doc/stable", None),
     "torch": ("https://pytorch.org/docs/stable", None),
diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.rst
new file mode 100644
index 0000000000000..334e0287aff09
--- /dev/null
+++ b/docs/source/dev/pooling_params.rst
@@ -0,0 +1,5 @@
+Pooling Parameters
+==================
+
+.. autoclass:: vllm.PoolingParams
+    :members:
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
index f0e6cddf09ef7..00b762ccc2ccb 100644
--- a/docs/source/getting_started/quickstart.rst
+++ b/docs/source/getting_started/quickstart.rst
@@ -138,10 +138,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
 
 A more detailed client example can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py>`__.
 
-OpenAI Chat API with vLLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+OpenAI Chat Completions API with vLLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-vLLM is designed to also support the OpenAI Chat API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
+vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
 
 You can use the `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_ endpoint to interact with the model:
 
@@ -157,7 +157,7 @@ You can use the `create chat completion <https://platform.openai.com/docs/api-re
     $         ]
     $     }'
 
-Alternatively, you can use the `openai` python package:
+Alternatively, you can use the ``openai`` python package:
 
 .. code-block:: python
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index c328c049b430c..2399fcf5faec9 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -134,6 +134,7 @@ Documentation
    :caption: Developer Documentation
 
    dev/sampling_params
+   dev/pooling_params
    dev/offline_inference/offline_index
    dev/engine/engine_index
    dev/kernel/paged_attention
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index a47902ab4fc9d..ac6405b9807a8 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -185,7 +185,7 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc
       --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
 
 .. important::
-    Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
+    Since OpenAI Vision API is based on `Chat Completions API <https://platform.openai.com/docs/api-reference/chat>`_,
     a chat template is **required** to launch the API server.
 
     Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it.
@@ -243,6 +243,10 @@ To consume the server, you can use the OpenAI client like in the example below:
 
 A full code example can be found in `examples/openai_api_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_api_client_for_multimodal.py>`_.
 
+.. tip::
+    There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
+    In fact, you can place image placeholders in the middle of the text by interleaving text and image content.
+
 .. note::
 
     By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable:
@@ -251,5 +255,49 @@ A full code example can be found in `examples/openai_api_client_for_multimodal.p
 
         $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
 
-.. note::
-    There is no need to format the prompt in the API request since it will be handled by the server.
+Chat Embeddings API
+^^^^^^^^^^^^^^^^^^^
+
+vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API <https://platform.openai.com/docs/api-reference/embeddings>`_,
+where a list of ``messages`` can be passed instead of batched ``input``. This enables multi-modal inputs to be passed to embedding models.
+
+.. tip::
+    The schema of ``messages`` is exactly the same as in the Chat Completions API.
+
+In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
+
+.. code-block:: bash
+
+    vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \
+      --trust-remote-code --max-model-len 4096
+
+.. important::
+
+    Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding``
+    to run this model in embedding mode instead of text generation mode.
+
+Since this schema is not defined by the OpenAI client, we post a request to the server using the lower-level ``requests`` library:
+
+.. code-block:: python
+
+    import requests
+
+    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+    response = requests.post(
+        "http://localhost:8000/v1/embeddings",
+        json={
+            "model": "TIGER-Lab/VLM2Vec-Full",
+            "messages": [{
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "Represent the given image."},
+                ],
+            }],
+            "encoding_format": "float",
+        },
+    )
+    response.raise_for_status()
+    response_json = response.json()
+    print("Embedding output:", response_json["data"][0]["embedding"])
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index a1f93a9a28578..0b5f75caf2475 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -26,13 +26,26 @@ print(completion.choices[0].message)
 ```
 
 ## API Reference
-Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except:
-- Chat: `tools`, and `tool_choice`.
-- Completions: `suffix`.
 
-vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst).
+We currently support the following OpenAI APIs:
+
+- [Completions API](https://platform.openai.com/docs/api-reference/completions)
+  - *Note: `suffix` parameter is not supported.*
+- [Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
+  - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst).
+    - *Note: `image_url.detail` parameter is not supported.*
+  - We also support `audio_url` content type for audio files.
+    - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema.
+    - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).*
+  - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
+- [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)
+  - Instead of `input`, you can pass in a list of `messages` (same schema as the Chat Completions API),
+    which will be treated as a single prompt to the model according to its chat template.
+    - This enables multi-modal inputs to be passed to embedding models; see [Using VLMs](../models/vlm.rst).
+  - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.*
 
 ## Extra Parameters
+
 vLLM supports a set of parameters that are not part of the OpenAI API.
 In order to use them, you can pass them as extra parameters in the OpenAI client.
 Or directly merge them into the JSON payload if you are using HTTP call directly.
@@ -49,7 +62,26 @@ completion = client.chat.completions.create(
 )
 ```
 
-### Extra Parameters for Chat API
+### Extra Parameters for Completions API
+
+The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-completion-sampling-params
+:end-before: end-completion-sampling-params
+```
+
+The following extra parameters are supported:
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-completion-extra-params
+:end-before: end-completion-extra-params
+```
+
+### Extra Parameters for Chat Completions API
+
 The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
@@ -66,21 +98,22 @@ The following extra parameters are supported:
 :end-before: end-chat-completion-extra-params
 ```
 
-### Extra Parameters for Completions API
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
+### Extra Parameters for Embeddings API
+
+The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
-:start-after: begin-completion-sampling-params
-:end-before: end-completion-sampling-params
+:start-after: begin-embedding-pooling-params
+:end-before: end-embedding-pooling-params
 ```
 
 The following extra parameters are supported:
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
-:start-after: begin-completion-extra-params
-:end-before: end-completion-extra-params
+:start-after: begin-embedding-extra-params
+:end-before: end-embedding-extra-params
 ```
 
 ## Chat Template
diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index d3aea533b6db9..4616f363cc04a 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -1,7 +1,6 @@
 from http import HTTPStatus
 from typing import List
 
-import openai
 import pytest
 import pytest_asyncio
 import requests
@@ -83,10 +82,8 @@ async def client(server):
     indirect=True,
 )
 @pytest.mark.asyncio
-async def test_show_version(client: openai.AsyncOpenAI):
-    base_url = str(client.base_url)[:-3].strip("/")
-
-    response = requests.get(base_url + "/version")
+async def test_show_version(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("version"))
     response.raise_for_status()
 
     assert response.json() == {"version": VLLM_VERSION}
@@ -102,9 +99,7 @@ async def test_show_version(client: openai.AsyncOpenAI):
     indirect=True,
 )
 @pytest.mark.asyncio
-async def test_check_health(client: openai.AsyncOpenAI):
-    base_url = str(client.base_url)[:-3].strip("/")
-
-    response = requests.get(base_url + "/health")
+async def test_check_health(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("health"))
 
     assert response.status_code == HTTPStatus.OK
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index f119c6c1201c9..9f2b77dde2a7f 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -4,14 +4,18 @@
 import openai
 import pytest
 import pytest_asyncio
+import requests
+
+from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
 
-EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
+MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
+DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
 
 
 @pytest.fixture(scope="module")
-def embedding_server():
+def server():
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -19,31 +23,29 @@ def embedding_server():
         "--enforce-eager",
         "--max-model-len",
         "8192",
+        "--chat-template",
+        DUMMY_CHAT_TEMPLATE,
     ]
 
-    with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
 
 @pytest_asyncio.fixture
-async def embedding_client(embedding_server):
-    async with embedding_server.get_async_client() as async_client:
+async def client(server):
+    async with server.get_async_client() as async_client:
         yield async_client
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [EMBEDDING_MODEL_NAME],
-)
-async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
-                                model_name: str):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
     input_texts = [
         "The chef prepared a delicious meal.",
     ]
 
     # test single embedding
-    embeddings = await embedding_client.embeddings.create(
+    embeddings = await client.embeddings.create(
         model=model_name,
         input=input_texts,
         encoding_format="float",
@@ -57,7 +59,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
 
     # test using token IDs
     input_tokens = [1, 1, 1, 1, 1]
-    embeddings = await embedding_client.embeddings.create(
+    embeddings = await client.embeddings.create(
         model=model_name,
         input=input_tokens,
         encoding_format="float",
@@ -71,18 +73,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [EMBEDDING_MODEL_NAME],
-)
-async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
-                               model_name: str):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
     # test List[str]
     input_texts = [
         "The cat sat on the mat.", "A feline was resting on a rug.",
         "Stars twinkle brightly in the night sky."
     ]
-    embeddings = await embedding_client.embeddings.create(
+    embeddings = await client.embeddings.create(
         model=model_name,
         input=input_texts,
         encoding_format="float",
@@ -90,11 +88,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
     assert embeddings.id is not None
     assert len(embeddings.data) == 3
     assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 32
+    assert embeddings.usage.total_tokens == 32
 
     # test List[List[int]]
     input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                     [25, 32, 64, 77]]
-    embeddings = await embedding_client.embeddings.create(
+    embeddings = await client.embeddings.create(
         model=model_name,
         input=input_tokens,
         encoding_format="float",
@@ -108,22 +109,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [EMBEDDING_MODEL_NAME],
-)
-async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_conversation_embedding(server: RemoteOpenAIServer,
+                                      client: openai.AsyncOpenAI,
+                                      model_name: str):
+    messages = [{
+        "role": "user",
+        "content": "The cat sat on the mat.",
+    }, {
+        "role": "assistant",
+        "content": "A feline was resting on a rug.",
+    }, {
+        "role": "user",
+        "content": "Stars twinkle brightly in the night sky.",
+    }]
+
+    chat_response = requests.post(server.url_for("v1/embeddings"),
+                                  json={
+                                      "model": model_name,
+                                      "messages": messages,
+                                      "encoding_format": "float",
+                                  })
+    chat_response.raise_for_status()
+    chat_embeddings = chat_response.json()
+
+    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        chat_template=DUMMY_CHAT_TEMPLATE,
+        add_generation_prompt=True,
+        continue_final_message=False,
+        tokenize=False,
+    )
+    completion_response = await client.embeddings.create(
+        model=model_name,
+        input=prompt,
+        encoding_format="float",
+        # To be consistent with chat
+        extra_body={"add_special_tokens": False},
+    )
+    completion_embeddings = completion_response.model_dump(mode="json")
+
+    assert chat_embeddings.pop("id") is not None
+    assert completion_embeddings.pop("id") is not None
+    assert chat_embeddings.pop("created") <= completion_embeddings.pop(
+        "created")
+    assert chat_embeddings == completion_embeddings
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
                                       model_name: str):
     input_texts = [
         "Hello my name is",
         "The best thing about vLLM is that it supports many different models"
     ]
 
-    responses_float = await embedding_client.embeddings.create(
-        input=input_texts, model=model_name, encoding_format="float")
+    responses_float = await client.embeddings.create(input=input_texts,
+                                                     model=model_name,
+                                                     encoding_format="float")
 
-    responses_base64 = await embedding_client.embeddings.create(
-        input=input_texts, model=model_name, encoding_format="base64")
+    responses_base64 = await client.embeddings.create(input=input_texts,
+                                                      model=model_name,
+                                                      encoding_format="base64")
 
     decoded_responses_base64_data = []
     for data in responses_base64.data:
@@ -137,8 +186,8 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
         1]
 
     # Default response is float32 decoded from base64 by OpenAI Client
-    responses_default = await embedding_client.embeddings.create(
-        input=input_texts, model=model_name)
+    responses_default = await client.embeddings.create(input=input_texts,
+                                                       model=model_name)
 
     assert responses_float.data[0].embedding == responses_default.data[
         0].embedding
@@ -147,18 +196,15 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [EMBEDDING_MODEL_NAME],
-)
-async def test_single_embedding_truncation(
-        embedding_client: openai.AsyncOpenAI, model_name: str):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
+                                           model_name: str):
     input_texts = [
         "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
     ]
 
     # test single embedding
-    embeddings = await embedding_client.embeddings.create(
+    embeddings = await client.embeddings.create(
         model=model_name,
         input=input_texts,
         extra_body={"truncate_prompt_tokens": 10})
@@ -173,7 +219,7 @@ async def test_single_embedding_truncation(
         1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
         9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
     ]
-    embeddings = await embedding_client.embeddings.create(
+    embeddings = await client.embeddings.create(
         model=model_name,
         input=input_tokens,
         extra_body={"truncate_prompt_tokens": 10})
@@ -187,18 +233,15 @@ async def test_single_embedding_truncation(
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [EMBEDDING_MODEL_NAME],
-)
-async def test_single_embedding_truncation_invalid(
-        embedding_client: openai.AsyncOpenAI, model_name: str):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
+                                                   model_name: str):
     input_texts = [
         "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
     ]
 
     with pytest.raises(openai.BadRequestError):
-        embeddings = await embedding_client.embeddings.create(
+        embeddings = await client.embeddings.create(
             model=model_name,
             input=input_texts,
             extra_body={"truncate_prompt_tokens": 8193})
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 6cb74eb78cbf0..b3f1fea91d13e 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -79,9 +79,8 @@ async def client(server):
 
 
 @pytest.mark.asyncio
-async def test_metrics_counts(client: openai.AsyncOpenAI):
-    base_url = str(client.base_url)[:-3].strip("/")
-
+async def test_metrics_counts(server: RemoteOpenAIServer,
+                              client: openai.AsyncClient):
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
@@ -89,7 +88,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI):
             prompt=_TOKENIZED_PROMPT,
             max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)
 
-    response = requests.get(base_url + "/metrics")
+    response = requests.get(server.url_for("metrics"))
     print(response.text)
     assert response.status_code == HTTPStatus.OK
 
@@ -170,16 +169,15 @@ async def test_metrics_counts(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_metrics_exist(client: openai.AsyncOpenAI):
-    base_url = str(client.base_url)[:-3].strip("/")
-
+async def test_metrics_exist(server: RemoteOpenAIServer,
+                             client: openai.AsyncClient):
     # sending a request triggers the metrics to be logged.
     await client.completions.create(model=MODEL_NAME,
                                     prompt="Hello, my name is",
                                     max_tokens=5,
                                     temperature=0.0)
 
-    response = requests.get(base_url + "/metrics")
+    response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK
 
     for metric in EXPECTED_METRICS:
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index 859a676a9c777..b1956a8cbc9dc 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -1,4 +1,3 @@
-import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
 import requests
@@ -55,9 +54,11 @@ async def client(server):
     [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
     indirect=["tokenizer_name"],
 )
-async def test_tokenize_completions(client: openai.AsyncOpenAI,
-                                    model_name: str, tokenizer_name: str):
-    base_url = str(client.base_url)[:-3].strip("/")
+async def test_tokenize_completions(
+    server: RemoteOpenAIServer,
+    model_name: str,
+    tokenizer_name: str,
+):
     tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                               tokenizer_mode="fast")
 
@@ -65,7 +66,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
         prompt = "vllm1 This is a test prompt."
         tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
 
-        response = requests.post(base_url + "/tokenize",
+        response = requests.post(server.url_for("tokenize"),
                                  json={
                                      "add_special_tokens": add_special,
                                      "model": model_name,
@@ -86,9 +87,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
     [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
     indirect=["tokenizer_name"],
 )
-async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
-                             tokenizer_name: str):
-    base_url = str(client.base_url)[:-3].strip("/")
+async def test_tokenize_chat(
+    server: RemoteOpenAIServer,
+    model_name: str,
+    tokenizer_name: str,
+):
     tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                               tokenizer_mode="fast")
 
@@ -121,7 +124,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
                 tokens = tokenizer.encode(prompt,
                                           add_special_tokens=add_special)
 
-                response = requests.post(base_url + "/tokenize",
+                response = requests.post(server.url_for("tokenize"),
                                          json={
                                              "add_generation_prompt":
                                              add_generation,
@@ -146,17 +149,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
     [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
     indirect=["tokenizer_name"],
 )
-async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
-                          tokenizer_name: str):
-    base_url = str(client.base_url)[:-3].strip("/")
+async def test_detokenize(
+    server: RemoteOpenAIServer,
+    model_name: str,
+    tokenizer_name: str,
+):
     tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                               tokenizer_mode="fast")
 
     prompt = "This is a test prompt. vllm1"
     tokens = tokenizer.encode(prompt, add_special_tokens=False)
 
-    print(f"CALLING {base_url} FOR {model_name}")
-    response = requests.post(base_url + "/detokenize",
+    response = requests.post(server.url_for("detokenize"),
                              json={
                                  "model": model_name,
                                  "tokens": tokens
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
new file mode 100644
index 0000000000000..73a69da32e434
--- /dev/null
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -0,0 +1,94 @@
+from typing import Dict
+
+import pytest
+import pytest_asyncio
+import requests
+
+from vllm.multimodal.utils import encode_image_base64, fetch_image
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
+MAXIMUM_IMAGES = 2
+
+# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
+TEST_IMAGE_URLS = [
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+]
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--task",
+        "embedding",
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "5",
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--limit-mm-per-prompt",
+        f"image={MAXIMUM_IMAGES}",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.fixture(scope="session")
+def base64_encoded_image() -> Dict[str, str]:
+    return {
+        image_url: encode_image_base64(fetch_image(image_url))
+        for image_url in TEST_IMAGE_URLS
+    }
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
+                               image_url: str):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            },
+            {
+                "type": "text",
+                "text": "Represent the given image."
+            },
+        ],
+    }]
+
+    response = requests.post(server.url_for("v1/embeddings"),
+                             json={
+                                 "model": model_name,
+                                 "messages": messages,
+                                 "encoding_format": "float"
+                             })
+    response.raise_for_status()
+
+    embeddings = response.json()
+    assert embeddings["id"] is not None
+    assert len(embeddings["data"]) == 1
+    assert len(embeddings["data"][0]["embedding"]) == 3072
+    assert embeddings["usage"]["completion_tokens"] == 0
+    assert embeddings["usage"]["prompt_tokens"] == 771
+    assert embeddings["usage"]["total_tokens"] == 771
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 46c92e10b360c..95fd56d916050 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -11,7 +11,7 @@
 from contextlib import asynccontextmanager
 from functools import partial
 from http import HTTPStatus
-from typing import AsyncIterator, Set
+from typing import AsyncIterator, Optional, Set
 
 import uvloop
 from fastapi import APIRouter, FastAPI, Request
@@ -51,7 +51,7 @@
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
 from vllm.entrypoints.openai.serving_tokenization import (
     OpenAIServingTokenization)
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
@@ -248,20 +248,25 @@ def mount_metrics(app: FastAPI):
     app.routes.append(metrics_route)
 
 
-def chat(request: Request) -> OpenAIServingChat:
+def base(request: Request) -> OpenAIServing:
+    # Reuse the existing instance
+    return tokenization(request)
+
+
+def chat(request: Request) -> Optional[OpenAIServingChat]:
     return request.app.state.openai_serving_chat
 
 
-def completion(request: Request) -> OpenAIServingCompletion:
+def completion(request: Request) -> Optional[OpenAIServingCompletion]:
     return request.app.state.openai_serving_completion
 
 
-def tokenization(request: Request) -> OpenAIServingTokenization:
-    return request.app.state.openai_serving_tokenization
+def embedding(request: Request) -> Optional[OpenAIServingEmbedding]:
+    return request.app.state.openai_serving_embedding
 
 
-def embedding(request: Request) -> OpenAIServingEmbedding:
-    return request.app.state.openai_serving_embedding
+def tokenization(request: Request) -> OpenAIServingTokenization:
+    return request.app.state.openai_serving_tokenization
 
 
 def engine_client(request: Request) -> EngineClient:
@@ -277,7 +282,9 @@ async def health(raw_request: Request) -> Response:
 
 @router.post("/tokenize")
 async def tokenize(request: TokenizeRequest, raw_request: Request):
-    generator = await tokenization(raw_request).create_tokenize(request)
+    handler = tokenization(raw_request)
+
+    generator = await handler.create_tokenize(request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -289,7 +296,9 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
 
 @router.post("/detokenize")
 async def detokenize(request: DetokenizeRequest, raw_request: Request):
-    generator = await tokenization(raw_request).create_detokenize(request)
+    handler = tokenization(raw_request)
+
+    generator = await handler.create_detokenize(request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -301,7 +310,9 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
 
 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
-    models = await completion(raw_request).show_available_models()
+    handler = base(raw_request)
+
+    models = await handler.show_available_models()
     return JSONResponse(content=models.model_dump())
 
 
@@ -314,9 +325,12 @@ async def show_version():
 @router.post("/v1/chat/completions")
 async def create_chat_completion(request: ChatCompletionRequest,
                                  raw_request: Request):
+    handler = chat(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Chat Completions API")
 
-    generator = await chat(raw_request).create_chat_completion(
-        request, raw_request)
+    generator = await handler.create_chat_completion(request, raw_request)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
@@ -330,8 +344,12 @@ async def create_chat_completion(request: ChatCompletionRequest,
 
 @router.post("/v1/completions")
 async def create_completion(request: CompletionRequest, raw_request: Request):
-    generator = await completion(raw_request).create_completion(
-        request, raw_request)
+    handler = completion(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Completions API")
+
+    generator = await handler.create_completion(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -343,8 +361,12 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
 
 @router.post("/v1/embeddings")
 async def create_embedding(request: EmbeddingRequest, raw_request: Request):
-    generator = await embedding(raw_request).create_embedding(
-        request, raw_request)
+    handler = embedding(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Embeddings API")
+
+    generator = await handler.create_embedding(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -382,30 +404,26 @@ async def stop_profile(raw_request: Request):
     @router.post("/v1/load_lora_adapter")
     async def load_lora_adapter(request: LoadLoraAdapterRequest,
                                 raw_request: Request):
-        response = await chat(raw_request).load_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
-
-        response = await completion(raw_request).load_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
+        for route in [chat, completion, embedding]:
+            handler = route(raw_request)
+            if handler is not None:
+                response = await handler.load_lora_adapter(request)
+                if isinstance(response, ErrorResponse):
+                    return JSONResponse(content=response.model_dump(),
+                                        status_code=response.code)
 
         return Response(status_code=200, content=response)
 
     @router.post("/v1/unload_lora_adapter")
     async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
                                   raw_request: Request):
-        response = await chat(raw_request).unload_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
-
-        response = await completion(raw_request).unload_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
+        for route in [chat, completion, embedding]:
+            handler = route(raw_request)
+            if handler is not None:
+                response = await handler.unload_lora_adapter(request)
+                if isinstance(response, ErrorResponse):
+                    return JSONResponse(content=response.model_dump(),
+                                        status_code=response.code)
 
         return Response(status_code=200, content=response)
 
@@ -501,7 +519,8 @@ def init_app_state(
         chat_template=args.chat_template,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
         enable_auto_tools=args.enable_auto_tool_choice,
-        tool_parser=args.tool_call_parser)
+        tool_parser=args.tool_call_parser,
+    ) if model_config.task == "generate" else None
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
@@ -510,13 +529,14 @@ def init_app_state(
         prompt_adapters=args.prompt_adapters,
         request_logger=request_logger,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
-    )
+    ) if model_config.task == "generate" else None
     state.openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
         base_model_paths,
         request_logger=request_logger,
-    )
+        chat_template=args.chat_template,
+    ) if model_config.task == "embedding" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 60fc5ac8d11d2..1335e51bd152c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -708,7 +708,7 @@ def validate_stream_options(cls, data):
         return data
 
 
-class EmbeddingRequest(OpenAIBaseModel):
+class EmbeddingCompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/embeddings
     model: str
@@ -720,10 +720,15 @@ class EmbeddingRequest(OpenAIBaseModel):
 
     # doc: begin-embedding-pooling-params
     additional_data: Optional[Any] = None
-
     # doc: end-embedding-pooling-params
 
     # doc: begin-embedding-extra-params
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."),
+    )
     priority: int = Field(
         default=0,
         description=(
@@ -737,6 +742,82 @@ def to_pooling_params(self):
         return PoolingParams(additional_data=self.additional_data)
 
 
+class EmbeddingChatRequest(OpenAIBaseModel):
+    model: str
+    messages: List[ChatCompletionMessageParam]
+
+    encoding_format: Literal["float", "base64"] = "float"
+    dimensions: Optional[int] = None
+    user: Optional[str] = None
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+
+    # doc: begin-chat-embedding-pooling-params
+    additional_data: Optional[Any] = None
+    # doc: end-chat-embedding-pooling-params
+
+    # doc: begin-chat-embedding-extra-params
+    add_generation_prompt: bool = Field(
+        default=True,
+        description=
+        ("If true, the generation prompt will be added to the chat template. "
+         "This is a parameter used by chat template in tokenizer config of the "
+         "model."),
+    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=
+        ("If this is set, the chat will be formatted so that the final "
+         "message in the chat is open-ended, without any EOS tokens. The "
+         "model will continue this message rather than starting a new one. "
+         "This allows you to \"prefill\" part of the model's response for it. "
+         "Cannot be used at the same time as `add_generation_prompt`."),
+    )
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."),
+    )
+    chat_template_kwargs: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."))
+    # doc: end-chat-embedding-extra-params
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get(
+                "add_generation_prompt"):
+            raise ValueError("Cannot set both `continue_final_message` and "
+                             "`add_generation_prompt` to True.")
+        return data
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
+
+
+EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
+
+
 class CompletionLogProbs(OpenAIBaseModel):
     text_offset: List[int] = Field(default_factory=list)
     token_logprobs: List[Optional[float]] = Field(default_factory=list)
@@ -799,7 +880,7 @@ class EmbeddingResponseData(OpenAIBaseModel):
 
 
 class EmbeddingResponse(OpenAIBaseModel):
-    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
     object: str = "list"
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index f5249a0c447b3..a64467a311523 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -217,13 +217,14 @@ async def main(args):
         prompt_adapters=None,
         request_logger=request_logger,
         chat_template=None,
-    )
+    ) if model_config.task == "generate" else None
     openai_serving_embedding = OpenAIServingEmbedding(
         engine,
         model_config,
         base_model_paths,
         request_logger=request_logger,
-    )
+        chat_template=None,
+    ) if model_config.task == "embedding" else None
 
     tracker = BatchProgressTracker()
     logger.info("Reading batch from %s...", args.input_file)
@@ -240,14 +241,31 @@ async def main(args):
 
         # Determine the type of request and run it.
         if request.url == "/v1/chat/completions":
-            response_futures.append(
-                run_request(openai_serving_chat.create_chat_completion,
-                            request, tracker))
+            handler_fn = (None if openai_serving_chat is None else
+                          openai_serving_chat.create_chat_completion)
+            if handler_fn is None:
+                response_futures.append(
+                    make_async_error_request_output(
+                        request,
+                        error_msg=
+                        "The model does not support Chat Completions API",
+                    ))
+                continue
+
+            response_futures.append(run_request(handler_fn, request, tracker))
             tracker.submitted()
         elif request.url == "/v1/embeddings":
-            response_futures.append(
-                run_request(openai_serving_embedding.create_embedding, request,
-                            tracker))
+            handler_fn = (None if openai_serving_embedding is None else
+                          openai_serving_embedding.create_embedding)
+            if handler_fn is None:
+                response_futures.append(
+                    make_async_error_request_output(
+                        request,
+                        error_msg="The model does not support Embeddings API",
+                    ))
+                continue
+
+            response_futures.append(run_request(handler_fn, request, tracker))
             tracker.submitted()
         else:
             response_futures.append(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 1f951d15a7a32..9551b4f2091dd 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -10,11 +10,7 @@
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import (ConversationMessage,
-                                         apply_hf_chat_template,
-                                         apply_mistral_chat_template,
-                                         load_chat_template,
-                                         parse_chat_messages_futures)
+from vllm.entrypoints.chat_utils import ConversationMessage, load_chat_template
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionLogProb, ChatCompletionLogProbs,
@@ -27,16 +23,12 @@
 from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
                                                     LoRAModulePath,
                                                     OpenAIServing,
-                                                    PromptAdapterPath,
-                                                    TextTokensPrompt)
+                                                    PromptAdapterPath)
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
-from vllm.inputs import TokensPrompt
 from vllm.logger import init_logger
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.sequence import Logprob
-from vllm.tracing import (contains_trace_headers, extract_trace_headers,
-                          log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.utils import iterate_with_cancellation
 
@@ -94,12 +86,12 @@ async def create_chat_completion(
         raw_request: Optional[Request] = None,
     ) -> Union[AsyncGenerator[str, None], ChatCompletionResponse,
                ErrorResponse]:
-        """Completion API similar to OpenAI's API.
+        """
+        Chat Completion API similar to OpenAI's API.
 
         See https://platform.openai.com/docs/api-reference/chat/create
         for the API specification. This API mimics the OpenAI
-        ChatCompletion API.
-
+        Chat Completion API.
         """
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
@@ -118,143 +110,106 @@ async def create_chat_completion(
                 prompt_adapter_request,
             ) = self._maybe_get_adapters(request)
 
-            model_config = self.model_config
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
-
-            conversation, mm_data_future = parse_chat_messages_futures(
-                request.messages, model_config, tokenizer)
+            tool_parser = self.tool_parser
+
+            # validation for OpenAI tools
+            # tool_choice = "required" is not supported
+            if request.tool_choice == "required":
+                return self.create_error_response(
+                    "tool_choice = \"required\" is not supported!")
+
+            if (request.tool_choice == "auto" and
+                    not (self.enable_auto_tools and tool_parser is not None)
+                    and not isinstance(tokenizer, MistralTokenizer)):
+                # for hf tokenizers, "auto" tools requires
+                # --enable-auto-tool-choice and --tool-call-parser
+                return self.create_error_response(
+                    "\"auto\" tool choice requires "
+                    "--enable-auto-tool-choice and --tool-call-parser to be set"
+                )
 
             tool_dicts = None if request.tools is None else [
                 tool.model_dump() for tool in request.tools
             ]
 
-            prompt: Union[str, List[int]]
-            is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer)
-            if is_mistral_tokenizer:
-                prompt = apply_mistral_chat_template(
-                    tokenizer,
-                    messages=request.messages,
-                    chat_template=request.chat_template or self.chat_template,
-                    add_generation_prompt=request.add_generation_prompt,
-                    continue_final_message=request.continue_final_message,
-                    tools=tool_dicts,
-                    documents=request.documents,
-                    **(request.chat_template_kwargs or {}),
-                )
-            else:
-                prompt = apply_hf_chat_template(
-                    tokenizer,
-                    conversation=conversation,
-                    chat_template=request.chat_template or self.chat_template,
-                    add_generation_prompt=request.add_generation_prompt,
-                    continue_final_message=request.continue_final_message,
-                    tools=tool_dicts,
-                    documents=request.documents,
-                    **(request.chat_template_kwargs or {}),
-                )
-        except Exception as e:
-            logger.exception("Error in applying chat template from request")
-            return self.create_error_response(str(e))
-
-        try:
-            mm_data = await mm_data_future
-        except Exception as e:
-            logger.exception("Error in loading multi-modal data")
+            (
+                conversation,
+                request_prompts,
+                engine_prompts,
+            ) = await self._preprocess_chat(
+                request,
+                tokenizer,
+                request.messages,
+                chat_template=request.chat_template or self.chat_template,
+                add_generation_prompt=request.add_generation_prompt,
+                continue_final_message=request.continue_final_message,
+                tool_dicts=tool_dicts,
+                documents=request.documents,
+                chat_template_kwargs=request.chat_template_kwargs,
+                tool_parser=tool_parser,
+                truncate_prompt_tokens=request.truncate_prompt_tokens,
+                add_special_tokens=request.add_special_tokens,
+            )
+        except ValueError as e:
+            logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
 
-        # validation for OpenAI tools
-        # tool_choice = "required" is not supported
-        if request.tool_choice == "required":
-            return self.create_error_response(
-                "tool_choice = \"required\" is not supported!")
-
-        if not is_mistral_tokenizer and request.tool_choice == "auto" and not (
-                self.enable_auto_tools and self.tool_parser is not None):
-            # for hf tokenizers, "auto" tools requires
-            # --enable-auto-tool-choice and --tool-call-parser
-            return self.create_error_response(
-                "\"auto\" tool choice requires "
-                "--enable-auto-tool-choice and --tool-call-parser to be set")
-
-        request_id = f"chat-{request.request_id}"
+        request_id = f"chatcmpl-{request.request_id}"
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
             raw_request.state.request_metadata = request_metadata
 
+        # Schedule the request and get the result generator.
+        generators: List[AsyncGenerator[RequestOutput, None]] = []
         try:
-            if self.enable_auto_tools and self.tool_parser:
-                request = self.tool_parser(tokenizer).adjust_request(
-                    request=request)
-
-            if isinstance(prompt, str):
-                prompt_inputs = self._tokenize_prompt_input(
-                    request,
-                    tokenizer,
-                    prompt,
-                    truncate_prompt_tokens=request.truncate_prompt_tokens,
-                    add_special_tokens=request.add_special_tokens,
-                )
-            else:
-                assert isinstance(prompt, list) and isinstance(
-                    prompt[0], int
-                ), "Prompt has to be either a string or a list of token ids"
-                prompt_inputs = TextTokensPrompt(
-                    prompt=tokenizer.decode(prompt), prompt_token_ids=prompt)
-
-            assert prompt_inputs is not None
-
-            sampling_params: Union[SamplingParams, BeamSearchParams]
-            default_max_tokens = self.max_model_len - len(
-                prompt_inputs["prompt_token_ids"])
-            if request.use_beam_search:
-                sampling_params = request.to_beam_search_params(
-                    default_max_tokens)
-            else:
-                sampling_params = request.to_sampling_params(
-                    default_max_tokens)
-
-            self._log_inputs(request_id,
-                             prompt_inputs,
-                             params=sampling_params,
-                             lora_request=lora_request,
-                             prompt_adapter_request=prompt_adapter_request)
-
-            engine_inputs = TokensPrompt(
-                prompt_token_ids=prompt_inputs["prompt_token_ids"])
-            if mm_data is not None:
-                engine_inputs["multi_modal_data"] = mm_data
-
-            is_tracing_enabled = (await
-                                  self.engine_client.is_tracing_enabled())
-            trace_headers = None
-            if is_tracing_enabled and raw_request:
-                trace_headers = extract_trace_headers(raw_request.headers)
-            if (not is_tracing_enabled and raw_request
-                    and contains_trace_headers(raw_request.headers)):
-                log_tracing_disabled_warning()
-
-            if isinstance(sampling_params, BeamSearchParams):
-                result_generator = self.engine_client.beam_search(
-                    prompt=engine_inputs,
-                    model_config=self.model_config,
-                    request_id=request_id,
-                    params=sampling_params,
-                )
-            else:
-                result_generator = self.engine_client.generate(
-                    engine_inputs,
-                    sampling_params,
-                    request_id,
-                    lora_request=lora_request,
-                    trace_headers=trace_headers,
-                    prompt_adapter_request=prompt_adapter_request,
-                    priority=request.priority,
-                )
+            for i, engine_prompt in enumerate(engine_prompts):
+                sampling_params: Union[SamplingParams, BeamSearchParams]
+                default_max_tokens = self.max_model_len - len(
+                    engine_prompt["prompt_token_ids"])
+                if request.use_beam_search:
+                    sampling_params = request.to_beam_search_params(
+                        default_max_tokens)
+                else:
+                    sampling_params = request.to_sampling_params(
+                        default_max_tokens)
+
+                self._log_inputs(request_id,
+                                 request_prompts[i],
+                                 params=sampling_params,
+                                 lora_request=lora_request,
+                                 prompt_adapter_request=prompt_adapter_request)
+
+                trace_headers = (None if raw_request is None else await
+                                 self._get_trace_headers(raw_request.headers))
+
+                if isinstance(sampling_params, BeamSearchParams):
+                    generator = self.engine_client.beam_search(
+                        prompt=engine_prompt,
+                        model_config=self.model_config,
+                        request_id=request_id,
+                        params=sampling_params,
+                    )
+                else:
+                    generator = self.engine_client.generate(
+                        engine_prompt,
+                        sampling_params,
+                        request_id,
+                        lora_request=lora_request,
+                        trace_headers=trace_headers,
+                        prompt_adapter_request=prompt_adapter_request,
+                        priority=request.priority,
+                    )
+
+                generators.append(generator)
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
 
+        assert len(generators) == 1
+        result_generator, = generators
+
         if raw_request:
             result_generator = iterate_with_cancellation(
                 result_generator, raw_request.is_disconnected)
@@ -626,6 +581,9 @@ async def chat_completion_full_generator(
                 final_res = res
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
 
         assert final_res is not None
 
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index da521a6012530..570232be38379 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -1,7 +1,6 @@
 import asyncio
 import time
-from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List,
-                    Optional)
+from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional
 from typing import Sequence as GenericSequence
 from typing import Tuple, Union, cast
 
@@ -30,18 +29,11 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.sequence import Logprob
-from vllm.tracing import (contains_trace_headers, extract_trace_headers,
-                          log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import merge_async_iterators, random_uuid
 
 logger = init_logger(__name__)
 
-TypeTokenIDs = List[int]
-TypeTopLogProbs = List[Optional[Dict[int, float]]]
-TypeCreateLogProbsFn = Callable[
-    [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], CompletionLogProbs]
-
 
 class OpenAIServingCompletion(OpenAIServing):
 
@@ -101,8 +93,6 @@ async def create_completion(
         if raw_request:
             raw_request.state.request_metadata = request_metadata
 
-        # Schedule the request and get the result generator.
-        generators: List[AsyncGenerator[RequestOutput, None]] = []
         try:
             (
                 lora_request,
@@ -111,19 +101,24 @@ async def create_completion(
 
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
 
-            prompts = list(
-                self._tokenize_prompt_input_or_inputs(
-                    request,
-                    tokenizer,
-                    request.prompt,
-                    truncate_prompt_tokens=request.truncate_prompt_tokens,
-                    add_special_tokens=request.add_special_tokens,
-                ))
+            request_prompts, engine_prompts = self._preprocess_completion(
+                request,
+                tokenizer,
+                request.prompt,
+                truncate_prompt_tokens=request.truncate_prompt_tokens,
+                add_special_tokens=request.add_special_tokens,
+            )
+        except ValueError as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(str(e))
 
-            for i, prompt_inputs in enumerate(prompts):
+        # Schedule the request and get the result generator.
+        generators: List[AsyncGenerator[RequestOutput, None]] = []
+        try:
+            for i, engine_prompt in enumerate(engine_prompts):
                 sampling_params: Union[SamplingParams, BeamSearchParams]
                 default_max_tokens = self.max_model_len - len(
-                    prompt_inputs["prompt_token_ids"])
+                    engine_prompt["prompt_token_ids"])
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
                         default_max_tokens)
@@ -134,36 +129,24 @@ async def create_completion(
                 request_id_item = f"{request_id}-{i}"
 
                 self._log_inputs(request_id_item,
-                                 prompt_inputs,
+                                 request_prompts[i],
                                  params=sampling_params,
                                  lora_request=lora_request,
                                  prompt_adapter_request=prompt_adapter_request)
 
-                is_tracing_enabled = (await
-                                      self.engine_client.is_tracing_enabled())
-                trace_headers = None
-                if is_tracing_enabled:
-                    trace_headers = extract_trace_headers(raw_request.headers)
-                if not is_tracing_enabled and contains_trace_headers(
-                        raw_request.headers):
-                    log_tracing_disabled_warning()
+                trace_headers = (await
+                                 self._get_trace_headers(raw_request.headers))
 
                 if isinstance(sampling_params, BeamSearchParams):
                     generator = self.engine_client.beam_search(
-                        prompt={
-                            "prompt_token_ids":
-                            prompt_inputs["prompt_token_ids"]
-                        },
+                        prompt=engine_prompt,
                         model_config=self.model_config,
                         request_id=request_id,
                         params=sampling_params,
                     )
                 else:
                     generator = self.engine_client.generate(
-                        {
-                            "prompt_token_ids":
-                            prompt_inputs["prompt_token_ids"]
-                        },
+                        engine_prompt,
                         sampling_params,
                         request_id_item,
                         lora_request=lora_request,
@@ -180,6 +163,8 @@ async def create_completion(
         result_generator = merge_async_iterators(
             *generators, is_cancelled=raw_request.is_disconnected)
 
+        num_prompts = len(engine_prompts)
+
         # Similar to the OpenAI API, when n != best_of, we do not stream the
         # results. In addition, we do not stream the results when use
         # beam search.
@@ -195,16 +180,22 @@ async def create_completion(
                 request_id,
                 created_time,
                 model_name,
-                num_prompts=len(prompts),
+                num_prompts=num_prompts,
                 tokenizer=tokenizer,
                 request_metadata=request_metadata)
 
         # Non-streaming response
-        final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts)
+        final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
 
+        try:
             for i, final_res in enumerate(final_res_batch):
                 assert final_res is not None
 
@@ -212,7 +203,7 @@ async def create_completion(
                 # We did not pass it into vLLM engine to avoid being redundant
                 # with the inputs token IDs
                 if final_res.prompt is None:
-                    final_res.prompt = prompts[i]["prompt"]
+                    final_res.prompt = request_prompts[i]["prompt"]
 
             final_res_batch_checked = cast(List[RequestOutput],
                                            final_res_batch)
@@ -226,8 +217,6 @@ async def create_completion(
                 tokenizer,
                 request_metadata,
             )
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 6c46aae2838f6..917856cd2b2dd 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -9,8 +9,10 @@
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import load_chat_template
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (EmbeddingRequest,
+from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest,
+                                              EmbeddingRequest,
                                               EmbeddingResponse,
                                               EmbeddingResponseData,
                                               ErrorResponse, UsageInfo)
@@ -21,8 +23,6 @@
 
 logger = init_logger(__name__)
 
-TypeTokenIDs = List[int]
-
 
 def _get_embedding(
     output: EmbeddingOutput,
@@ -76,6 +76,7 @@ def __init__(
         base_model_paths: List[BaseModelPath],
         *,
         request_logger: Optional[RequestLogger],
+        chat_template: Optional[str],
     ):
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
@@ -83,21 +84,20 @@ def __init__(
                          lora_modules=None,
                          prompt_adapters=None,
                          request_logger=request_logger)
-        self._enabled = self._check_embedding_mode(
-            model_config.task == "embedding")
+
+        self.chat_template = load_chat_template(chat_template)
 
     async def create_embedding(
         self,
         request: EmbeddingRequest,
         raw_request: Optional[Request] = None,
     ) -> Union[EmbeddingResponse, ErrorResponse]:
-        """Completion API similar to OpenAI's API.
+        """
+        Embedding API similar to OpenAI's API.
 
         See https://platform.openai.com/docs/api-reference/embeddings/create
         for the API specification. This API mimics the OpenAI Embedding API.
         """
-        if not self._enabled:
-            return self.create_error_response("Embedding API disabled")
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             return error_check_ret
@@ -122,8 +122,6 @@ async def create_embedding(
                     "greater than max_model_len."
                     " Please, select a smaller truncation size.")
 
-        # Schedule the request and get the result generator.
-        generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = []
         try:
             (
                 lora_request,
@@ -132,32 +130,60 @@ async def create_embedding(
 
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
 
-            pooling_params = request.to_pooling_params()
+            if prompt_adapter_request is not None:
+                raise NotImplementedError("Prompt adapter is not supported "
+                                          "for embedding models")
+
+            if isinstance(request, EmbeddingChatRequest):
+                (
+                    _,
+                    request_prompts,
+                    engine_prompts,
+                ) = await self._preprocess_chat(
+                    request,
+                    tokenizer,
+                    request.messages,
+                    chat_template=request.chat_template or self.chat_template,
+                    add_generation_prompt=request.add_generation_prompt,
+                    continue_final_message=request.continue_final_message,
+                    truncate_prompt_tokens=truncate_prompt_tokens,
+                    add_special_tokens=request.add_special_tokens,
+                )
+            else:
+                request_prompts, engine_prompts = self._preprocess_completion(
+                    request,
+                    tokenizer,
+                    request.input,
+                    truncate_prompt_tokens=truncate_prompt_tokens,
+                    add_special_tokens=request.add_special_tokens,
+                )
+        except ValueError as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(str(e))
 
-            prompts = list(
-                self._tokenize_prompt_input_or_inputs(request, tokenizer,
-                                                      request.input,
-                                                      truncate_prompt_tokens))
+        # Schedule the request and get the result generator.
+        generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = []
+        try:
+            pooling_params = request.to_pooling_params()
 
-            for i, prompt_inputs in enumerate(prompts):
+            for i, engine_prompt in enumerate(engine_prompts):
                 request_id_item = f"{request_id}-{i}"
 
                 self._log_inputs(request_id_item,
-                                 prompt_inputs,
+                                 request_prompts[i],
                                  params=pooling_params,
                                  lora_request=lora_request,
                                  prompt_adapter_request=prompt_adapter_request)
 
-                if prompt_adapter_request is not None:
-                    raise NotImplementedError(
-                        "Prompt adapter is not supported "
-                        "for embedding models")
+                trace_headers = (None if raw_request is None else await
+                                 self._get_trace_headers(raw_request.headers))
 
                 generator = self.engine_client.encode(
-                    {"prompt_token_ids": prompt_inputs["prompt_token_ids"]},
+                    engine_prompt,
                     pooling_params,
                     request_id_item,
                     lora_request=lora_request,
+                    trace_headers=trace_headers,
                     priority=request.priority,
                 )
 
@@ -171,13 +197,18 @@ async def create_embedding(
             is_cancelled=raw_request.is_disconnected if raw_request else None,
         )
 
+        num_prompts = len(engine_prompts)
+
         # Non-streaming response
         final_res_batch: List[Optional[EmbeddingRequestOutput]]
-        final_res_batch = [None] * len(prompts)
+        final_res_batch = [None] * num_prompts
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
 
+        try:
             for final_res in final_res_batch:
                 assert final_res is not None
 
@@ -187,18 +218,8 @@ async def create_embedding(
             response = request_output_to_embedding_response(
                 final_res_batch_checked, request_id, created_time, model_name,
                 encoding_format)
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
 
         return response
-
-    def _check_embedding_mode(self, embedding_mode: bool) -> bool:
-        if not embedding_mode:
-            logger.warning(
-                "embedding_mode is False. Embedding API will not work.")
-        else:
-            logger.info("Activating the server engine with embedding enabled.")
-        return embedding_mode
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 22a01b3dc4cc0..e7aeac8f8c018 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -2,28 +2,38 @@
 import pathlib
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import Iterable, Iterator, List, Optional, Tuple, TypedDict, Union
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping,
+                    Optional, Sequence, Tuple, TypedDict, Union)
 
 from pydantic import Field
+from starlette.datastructures import Headers
 from typing_extensions import Annotated
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                         ConversationMessage,
+                                         apply_hf_chat_template,
+                                         apply_mistral_chat_template,
+                                         parse_chat_messages_futures)
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               CompletionRequest,
                                               DetokenizeRequest,
-                                              EmbeddingRequest, ErrorResponse,
+                                              EmbeddingChatRequest,
+                                              EmbeddingCompletionRequest,
+                                              ErrorResponse,
                                               LoadLoraAdapterRequest,
                                               ModelCard, ModelList,
                                               ModelPermission,
                                               TokenizeChatRequest,
                                               TokenizeCompletionRequest,
-                                              TokenizeRequest,
                                               UnloadLoraAdapterRequest)
+from vllm.entrypoints.openai.tool_parsers import ToolParser
 # yapf: enable
+from vllm.inputs import TokensPrompt
 from vllm.inputs.parse import parse_and_batch_prompt
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -31,8 +41,10 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.sequence import Logprob
-from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import AtomicCounter
+from vllm.tracing import (contains_trace_headers, extract_trace_headers,
+                          log_tracing_disabled_warning)
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+from vllm.utils import AtomicCounter, is_list_of
 
 logger = init_logger(__name__)
 
@@ -56,8 +68,14 @@ class LoRAModulePath:
     base_model_name: Optional[str] = None
 
 
-AnyRequest = Union[ChatCompletionRequest, CompletionRequest, DetokenizeRequest,
-                   EmbeddingRequest, TokenizeRequest]
+CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest,
+                              EmbeddingCompletionRequest,
+                              TokenizeCompletionRequest]
+
+ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest,
+                        TokenizeChatRequest]
+
+AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest]
 
 
 class TextTokensPrompt(TypedDict):
@@ -65,6 +83,9 @@ class TextTokensPrompt(TypedDict):
     prompt_token_ids: List[int]
 
 
+RequestPrompt = Union[List[int], str, TextTokensPrompt]
+
+
 class OpenAIServing:
 
     def __init__(
@@ -246,7 +267,8 @@ def _validate_input(
         token_num = len(input_ids)
 
         # Note: EmbeddingRequest doesn't have max_tokens
-        if isinstance(request, EmbeddingRequest):
+        if isinstance(request,
+                      (EmbeddingChatRequest, EmbeddingCompletionRequest)):
             if token_num > self.max_model_len:
                 raise ValueError(
                     f"This model's maximum context length is "
@@ -373,10 +395,115 @@ def _tokenize_prompt_input_or_inputs(
                     truncate_prompt_tokens=truncate_prompt_tokens,
                 )
 
+    def _preprocess_completion(
+        self,
+        request: CompletionLikeRequest,
+        tokenizer: AnyTokenizer,
+        input_or_inputs: Union[str, List[str], List[int], List[List[int]]],
+        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
+        add_special_tokens: bool = True,
+    ) -> Tuple[Sequence[TextTokensPrompt], List[TokensPrompt]]:
+        request_prompts = [
+            request_prompt
+            for request_prompt in self._tokenize_prompt_input_or_inputs(
+                request,
+                tokenizer,
+                input_or_inputs,
+                truncate_prompt_tokens=truncate_prompt_tokens,
+                add_special_tokens=add_special_tokens,
+            )
+        ]
+
+        engine_prompts = [
+            TokensPrompt(prompt_token_ids=request_prompt["prompt_token_ids"])
+            for request_prompt in request_prompts
+        ]
+
+        return request_prompts, engine_prompts
+
+    async def _preprocess_chat(
+        self,
+        request: ChatLikeRequest,
+        tokenizer: AnyTokenizer,
+        messages: List[ChatCompletionMessageParam],
+        chat_template: Optional[str] = None,
+        add_generation_prompt: bool = True,
+        continue_final_message: bool = False,
+        tool_dicts: Optional[List[Dict[str, Any]]] = None,
+        documents: Optional[List[Dict[str, str]]] = None,
+        chat_template_kwargs: Optional[Dict[str, Any]] = None,
+        tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None,
+        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
+        add_special_tokens: bool = False,
+    ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt],
+               List[TokensPrompt]]:
+        conversation, mm_data_future = parse_chat_messages_futures(
+            messages,
+            self.model_config,
+            tokenizer,
+        )
+
+        request_prompt: Union[str, List[int]]
+        is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer)
+        if is_mistral_tokenizer:
+            request_prompt = apply_mistral_chat_template(
+                tokenizer,
+                messages=messages,
+                chat_template=chat_template,
+                add_generation_prompt=add_generation_prompt,
+                continue_final_message=continue_final_message,
+                tools=tool_dicts,
+                documents=documents,
+                **(chat_template_kwargs or {}),
+            )
+        else:
+            request_prompt = apply_hf_chat_template(
+                tokenizer,
+                conversation=conversation,
+                chat_template=chat_template,
+                add_generation_prompt=add_generation_prompt,
+                continue_final_message=continue_final_message,
+                tools=tool_dicts,
+                documents=documents,
+                **(chat_template_kwargs or {}),
+            )
+
+        mm_data = await mm_data_future
+
+        if tool_parser is not None:
+            if not isinstance(request, ChatCompletionRequest):
+                msg = "Tool usage is only supported for Chat Completions API"
+                raise NotImplementedError(msg)
+
+            request = tool_parser(tokenizer).adjust_request(request=request)
+
+        if isinstance(request_prompt, str):
+            prompt_inputs = self._tokenize_prompt_input(
+                request,
+                tokenizer,
+                request_prompt,
+                truncate_prompt_tokens=truncate_prompt_tokens,
+                add_special_tokens=add_special_tokens,
+            )
+        else:
+            # For MistralTokenizer
+            assert is_list_of(request_prompt, int), (
+                "Prompt has to be either a string or a list of token ids")
+            prompt_inputs = TextTokensPrompt(
+                prompt=tokenizer.decode(request_prompt),
+                prompt_token_ids=request_prompt)
+
+        engine_prompt = TokensPrompt(
+            prompt_token_ids=prompt_inputs["prompt_token_ids"])
+        if mm_data is not None:
+            engine_prompt["multi_modal_data"] = mm_data
+
+        return conversation, [request_prompt], [engine_prompt]
+
     def _log_inputs(
         self,
         request_id: str,
-        inputs: Union[str, List[int], TextTokensPrompt],
+        inputs: RequestPrompt,
         params: Optional[Union[SamplingParams, PoolingParams,
                                BeamSearchParams]],
         lora_request: Optional[LoRARequest],
@@ -404,6 +531,20 @@ def _log_inputs(
             prompt_adapter_request=prompt_adapter_request,
         )
 
+    async def _get_trace_headers(
+        self,
+        headers: Headers,
+    ) -> Optional[Mapping[str, str]]:
+        is_tracing_enabled = await self.engine_client.is_tracing_enabled()
+
+        if is_tracing_enabled:
+            return extract_trace_headers(headers)
+
+        if contains_trace_headers(headers):
+            log_tracing_disabled_warning()
+
+        return None
+
     @staticmethod
     def _get_decoded_token(logprob: Logprob,
                            token_id: int,
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index a269c94c7ec0d..1fd82304f7a4d 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -2,10 +2,7 @@
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
-                                         apply_mistral_chat_template,
-                                         load_chat_template,
-                                         parse_chat_messages_futures)
+from vllm.entrypoints.chat_utils import load_chat_template
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -20,7 +17,6 @@
                                                     LoRAModulePath,
                                                     OpenAIServing)
 from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import MistralTokenizer
 from vllm.utils import random_uuid
 
 logger = init_logger(__name__)
@@ -62,59 +58,51 @@ async def create_tokenize(
 
         request_id = f"tokn-{random_uuid()}"
 
-        (
-            lora_request,
-            prompt_adapter_request,
-        ) = self._maybe_get_adapters(request)
-
-        tokenizer = await self.engine_client.get_tokenizer(lora_request)
-
-        prompt: Union[str, List[int]]
-        if isinstance(request, TokenizeChatRequest):
-            model_config = self.model_config
-
-            conversation, mm_data_future = parse_chat_messages_futures(
-                request.messages, model_config, tokenizer)
-
-            mm_data = await mm_data_future
-            if mm_data:
-                logger.warning(
-                    "Multi-modal inputs are ignored during tokenization")
-
-            if isinstance(tokenizer, MistralTokenizer):
-                prompt = apply_mistral_chat_template(
+        try:
+            (
+                lora_request,
+                prompt_adapter_request,
+            ) = self._maybe_get_adapters(request)
+
+            tokenizer = await self.engine_client.get_tokenizer(lora_request)
+
+            if isinstance(request, TokenizeChatRequest):
+                (
+                    _,
+                    request_prompts,
+                    engine_prompts,
+                ) = await self._preprocess_chat(
+                    request,
                     tokenizer,
-                    messages=request.messages,
+                    request.messages,
                     chat_template=self.chat_template,
                     add_generation_prompt=request.add_generation_prompt,
                     continue_final_message=request.continue_final_message,
+                    add_special_tokens=request.add_special_tokens,
                 )
             else:
-                prompt = apply_hf_chat_template(
+                request_prompts, engine_prompts = self._preprocess_completion(
+                    request,
                     tokenizer,
-                    conversation=conversation,
-                    chat_template=self.chat_template,
-                    add_generation_prompt=request.add_generation_prompt,
-                    continue_final_message=request.continue_final_message,
+                    request.prompt,
+                    add_special_tokens=request.add_special_tokens,
                 )
-        else:
-            prompt = request.prompt
+        except ValueError as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(str(e))
 
-        self._log_inputs(request_id,
-                         prompt,
-                         params=None,
-                         lora_request=lora_request,
-                         prompt_adapter_request=prompt_adapter_request)
+        input_ids: List[int] = []
+        for i, engine_prompt in enumerate(engine_prompts):
+            self._log_inputs(request_id,
+                             request_prompts[i],
+                             params=None,
+                             lora_request=lora_request,
+                             prompt_adapter_request=prompt_adapter_request)
 
-        # Silently ignore prompt adapter since it does not affect tokenization
+            # Silently ignore prompt adapter since it does not affect
+            # tokenization (unlike the Embeddings API, which raises an error)
 
-        prompt_input = self._tokenize_prompt_input(
-            request,
-            tokenizer,
-            prompt,
-            add_special_tokens=request.add_special_tokens,
-        )
-        input_ids = prompt_input["prompt_token_ids"]
+            input_ids.extend(engine_prompt["prompt_token_ids"])
 
         return TokenizeResponse(tokens=input_ids,
                                 count=len(input_ids),
@@ -143,9 +131,8 @@ async def create_detokenize(
                          lora_request=lora_request,
                          prompt_adapter_request=prompt_adapter_request)
 
-        if prompt_adapter_request is not None:
-            raise NotImplementedError("Prompt adapter is not supported "
-                                      "for tokenization")
+        # Silently ignore prompt adapter since it does not affect tokenization
+        # (unlike the Embeddings API, which raises an error)
 
         prompt_input = self._tokenize_prompt_input(
             request,
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 7461fb51989c6..2635c0bccd1c4 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -7,7 +7,7 @@ class PoolingParams(
         msgspec.Struct,
         omit_defaults=True,  # type: ignore[call-arg]
         array_like=True):  # type: ignore[call-arg]
-    """Pooling parameters for pooling.
+    """Pooling parameters for embeddings API.
 
     Attributes:
         additional_data: Any additional data needed for pooling.
@@ -16,7 +16,7 @@ class PoolingParams(
 
     def clone(self) -> "PoolingParams":
         """Returns a deep copy of the PoolingParams instance."""
-        return PoolingParams(additional_data=self.additional_data, )
+        return PoolingParams(additional_data=self.additional_data)
 
     def __repr__(self) -> str:
         return (f"PoolingParams("

From 30a2e8074246e11a1452ab5e84a7be65ecac6119 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Fri, 1 Nov 2024 09:55:29 -0400
Subject: [PATCH 12/85] [CI/Build] Add Model Tests for PixtralHF (#9813)

---
 tests/models/decoder_only/vision_language/test_models.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index d738647c91b66..e49ea6f98324d 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -291,6 +291,15 @@
     #     vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
     #     num_logprobs=10,
     # ),
+    "pixtral_hf": VLMTestInfo(
+        models=["nm-testing/pixtral-12b-FP8-dynamic"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<s>[INST]{img_prompt}[/INST]",
+        img_idx_to_prompt=lambda idx: "[IMG]",
+        max_model_len=8192,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+    ),
     "qwen": VLMTestInfo(
         models=["Qwen/Qwen-VL"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),

From ba0d8920742597269745f3551eb97b1b19f5e582 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 1 Nov 2024 22:09:07 +0800
Subject: [PATCH 13/85] [Frontend] Use a proper chat template for VLM2Vec
 (#9912)

---
 docs/source/models/vlm.rst                    | 14 +++++---
 ..._chat_completion_client_for_multimodal.py} |  0
 ...ai_chat_embedding_client_for_multimodal.py | 33 +++++++++++++++++++
 examples/template_vlm2vec.jinja               | 16 +++++++++
 .../openai/test_vision_embedding.py           | 11 +++++--
 vllm/entrypoints/chat_utils.py                | 15 ++++++---
 6 files changed, 78 insertions(+), 11 deletions(-)
 rename examples/{openai_api_client_for_multimodal.py => openai_chat_completion_client_for_multimodal.py} (100%)
 create mode 100644 examples/openai_chat_embedding_client_for_multimodal.py
 create mode 100644 examples/template_vlm2vec.jinja

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index ac6405b9807a8..3377502a6db28 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -240,8 +240,7 @@ To consume the server, you can use the OpenAI client like in the example below:
     )
     print("Chat completion output:", chat_response.choices[0].message.content)
 
-
-A full code example can be found in `examples/openai_api_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_api_client_for_multimodal.py>`_.
+A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py>`_.
 
 .. tip::
     There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
@@ -269,14 +268,19 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
 .. code-block:: bash
 
     vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \
-      --trust-remote-code --max-model-len 4096
+      --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
 
 .. important::
 
     Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding``
     to run this model in embedding mode instead of text generation mode.
 
-Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
+.. important::
+
+    VLM2Vec does not expect chat-based input. We use a `custom chat template <https://github.com/vllm-project/vllm/blob/main/examples/template_vlm2vec.jinja>`_
+    to combine the text and images together.
+
+Since the request schema is not defined by the OpenAI client, we post a request to the server using the lower-level ``requests`` library:
 
 .. code-block:: python
 
@@ -301,3 +305,5 @@ Since this schema is not defined by OpenAI client, we post a request to the serv
     response.raise_for_status()
     response_json = response.json()
     print("Embedding output:", response_json["data"][0]["embedding"])
+
+A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py>`_.
diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
similarity index 100%
rename from examples/openai_api_client_for_multimodal.py
rename to examples/openai_chat_completion_client_for_multimodal.py
diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py
new file mode 100644
index 0000000000000..effb588e1387f
--- /dev/null
+++ b/examples/openai_chat_embedding_client_for_multimodal.py
@@ -0,0 +1,33 @@
+import requests
+
+image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+response = requests.post(
+    "http://localhost:8000/v1/embeddings",
+    json={
+        "model":
+        "TIGER-Lab/VLM2Vec-Full",
+        "messages": [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "Represent the given image."
+                },
+            ],
+        }],
+        "encoding_format":
+        "float",
+    },
+)
+response.raise_for_status()
+response_json = response.json()
+
+print("Embedding output:", response_json["data"][0]["embedding"])
diff --git a/examples/template_vlm2vec.jinja b/examples/template_vlm2vec.jinja
new file mode 100644
index 0000000000000..489b99604af38
--- /dev/null
+++ b/examples/template_vlm2vec.jinja
@@ -0,0 +1,16 @@
+{%- if messages | length > 1 -%}
+    {{ raise_exception('Embedding models should only embed one message at a time') }}
+{%- endif -%}
+
+{% set vars = namespace(parts=[], next_image_id=1) %}
+{%- for message in messages -%}
+    {%- for content in message['content'] -%}
+        {%- if content['type'] == 'text' -%}
+            {%- set vars.parts = vars.parts + [content['text']] %}
+        {%- elif content['type'] == 'image' -%}
+            {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %}
+            {%- set vars.next_image_id = vars.next_image_id + 1 %}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endfor -%}
+{{ vars.parts | join(' ') }}
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index 73a69da32e434..d0c43b47bf0af 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -6,11 +6,14 @@
 
 from vllm.multimodal.utils import encode_image_base64, fetch_image
 
-from ...utils import RemoteOpenAIServer
+from ...utils import VLLM_PATH, RemoteOpenAIServer
 
 MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
 MAXIMUM_IMAGES = 2
 
+vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
+assert vlm2vec_jinja_path.exists()
+
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
     "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
@@ -35,6 +38,8 @@ def server():
         "--trust-remote-code",
         "--limit-mm-per-prompt",
         f"image={MAXIMUM_IMAGES}",
+        "--chat-template",
+        str(vlm2vec_jinja_path),
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -90,5 +95,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
     assert len(embeddings["data"]) == 1
     assert len(embeddings["data"][0]["embedding"]) == 3072
     assert embeddings["usage"]["completion_tokens"] == 0
-    assert embeddings["usage"]["prompt_tokens"] == 771
-    assert embeddings["usage"]["total_tokens"] == 771
+    assert embeddings["usage"]["prompt_tokens"] == 762
+    assert embeddings["usage"]["total_tokens"] == 762
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index ce36f20760f4c..bc2de2d162473 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -156,6 +156,10 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer):
 
         self._items: List[_T] = []
 
+    @property
+    def model_config(self) -> ModelConfig:
+        return self._model_config
+
     @staticmethod
     @lru_cache(maxsize=None)
     def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
@@ -491,10 +495,13 @@ def _parse_chat_message_content_parts(
     content: List[Union[str, Dict[str, str]]] = []
 
     mm_parser = mm_tracker.create_parser()
-    wrap_dicts = \
-        mm_tracker._model_config.hf_config.model_type in \
-            MODEL_KEEP_MULTI_MODAL_CONTENT or \
-        (chat_template_text_format == "openai")
+    model_config = mm_tracker.model_config
+
+    wrap_dicts = (chat_template_text_format == "openai"
+                  or (model_config.task == "embedding"
+                      and model_config.is_multimodal_model)
+                  or (model_config.hf_config.model_type
+                      in MODEL_KEEP_MULTI_MODAL_CONTENT))
 
     for part in parts:
         parse_res = _parse_chat_message_content_part(

From 1dd4cb2935fc3fff9c156b5772d18e0a0d1861f0 Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Fri, 1 Nov 2024 11:33:15 -0600
Subject: [PATCH 14/85] [Bugfix] Fix edge cases for MistralTokenizer (#9625)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 tests/tokenization/test_detokenize.py         | 80 +++++++++++++++----
 vllm/transformers_utils/tokenizers/mistral.py | 64 ++++++++++-----
 2 files changed, 105 insertions(+), 39 deletions(-)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index f4551ed42efb8..1d07885349409 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Generator, List, Optional
 
 import pytest
 from transformers import AutoTokenizer
@@ -7,11 +7,17 @@
 from vllm.transformers_utils.detokenizer import (Detokenizer,
                                                  detokenize_incrementally)
 from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
+from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
 TRUTH = [
     "Hello here, this is a simple test",
     "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
-    "我很感谢你的热情"
+    "我很感谢你的热情",
+    # Burmese text triggers an edge-case for Mistral's V3-Tekken tokenizer (eg.
+    # for mistralai/Pixtral-12B-2409) where tokens may map to bytes with
+    # incomplete UTF-8 characters
+    # see https://github.com/vllm-project/vllm/pull/9625
+    "ပုံပြင်လေးပြောပြပါ်",
 ]
 TOKENIZERS = [
     "facebook/opt-125m",
@@ -24,6 +30,7 @@
     "tiiuae/falcon-7b",
     "meta-llama/Llama-2-7b-hf",
     "codellama/CodeLlama-7b-hf",
+    "mistralai/Pixtral-12B-2409",
 ]
 
 
@@ -49,15 +56,55 @@ def _run_incremental_decode(tokenizer, all_input_ids,
     return decoded_text
 
 
+@pytest.fixture
+def tokenizer(tokenizer_name):
+    return (MistralTokenizer.from_pretrained(tokenizer_name)
+            if "mistral" in tokenizer_name else
+            AutoTokenizer.from_pretrained(tokenizer_name))
+
+
+@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
+@pytest.mark.parametrize(
+    "truth",
+    [
+        # Burmese text triggers an edge-case where tokens may map to bytes with
+        # incomplete UTF-8 characters
+        "ပုံပြင်လေးပြောပြပါ",
+        # Using "URGENCY" since "CY" has token id 130282
+        "URGENCY🌶️",
+    ])
+def test_mistral_edge_case(tokenizer, truth):
+    """Test specific edge cases with the V3-Tekken MistralTokenizer.
+
+    See https://github.com/vllm-project/vllm/pull/9625
+    """
+    starting_index = 0
+    all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids
+
+    decoded_text = _run_incremental_decode(tokenizer,
+                                           all_input_ids,
+                                           skip_special_tokens=True,
+                                           starting_index=starting_index)
+    assert decoded_text == truth
+
+
+@pytest.fixture
+def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
+    """Yield the parametrized flag, skipping unsupported Mistral cases."""
+    if "mistral" in tokenizer_name and not request.param:
+        # MistralTokenizer.decode() rejects skip_special_tokens=False.
+        pytest.skip("mistral doesn't support skip_special_tokens=False")
+
+    yield bool(request.param)
+
+
 @pytest.mark.parametrize("truth", TRUTH)
 @pytest.mark.parametrize("with_prompt", [True, False])
-@pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", (True, False))
-def test_decode_streaming(tokenizer_id, truth, with_prompt,
-                          skip_special_tokens):
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
+@pytest.mark.parametrize("skip_special_tokens", (True, False), indirect=True)
+def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens):
     if with_prompt:
-        truth_tokens = tokenizer(truth, add_special_tokens=False)["input_ids"]
+        truth_tokens = tokenizer(truth, add_special_tokens=False).input_ids
         prompt_input_ids = truth_tokens[:len(truth) // 2]
         generated_input_ids = truth_tokens[len(truth) // 2:]
         all_input_ids = prompt_input_ids + generated_input_ids
@@ -68,7 +115,7 @@ def test_decode_streaming(tokenizer_id, truth, with_prompt,
     else:
         generated = truth
         starting_index = 0
-        all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
+        all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids
     if skip_special_tokens:
         if tokenizer.bos_token_id is not None:
             all_input_ids = [tokenizer.bos_token_id] + all_input_ids
@@ -98,7 +145,7 @@ def detokenizer(tokenizer_name: str) -> Detokenizer:
         enable_lora=False,
         max_num_seqs=100,
         max_input_length=None,
-        tokenizer_mode="auto",
+        tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto",
         trust_remote_code=False,
         revision=None,
     )
@@ -113,9 +160,8 @@ def detokenizer(tokenizer_name: str) -> Detokenizer:
 
 @pytest.fixture(name="complete_sequence_token_ids")
 def create_complete_sequence_token_ids(complete_sequence: str,
-                                       tokenizer_name: str) -> List[int]:
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    complete_sequence_token_ids = tokenizer(complete_sequence)["input_ids"]
+                                       tokenizer) -> List[int]:
+    complete_sequence_token_ids = tokenizer(complete_sequence).input_ids
     return complete_sequence_token_ids
 
 
@@ -150,7 +196,7 @@ def create_dummy_prompt_logprobs(
 
 @pytest.mark.parametrize("complete_sequence", TRUTH)
 @pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", [True, False])
+@pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True)
 def test_decode_sequence_logprobs(complete_sequence: str,
                                   complete_sequence_token_ids: List[int],
                                   detokenizer: Detokenizer,
@@ -208,9 +254,9 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int],
 
     # decoded_prompt_logprobs doesn't contain the first token.
     token_ids = complete_sequence_token_ids
-    tokenzier = detokenizer.get_tokenizer_for_seq(seq)
-    text_full = tokenzier.decode(token_ids, skip_special_tokens=True)
-    text_first = tokenzier.decode(token_ids[0], skip_special_tokens=True)
+    tokenizer = detokenizer.get_tokenizer_for_seq(seq)
+    text_full = tokenizer.decode(token_ids, skip_special_tokens=True)
+    text_first = tokenizer.decode(token_ids[0], skip_special_tokens=True)
     text = text_full[len(text_first):]
 
     # Text for logprobs for the chosen token should be the same as the
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 80e21c2d32ecc..896f70bc1dafd 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -16,9 +16,13 @@
 from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
                                                      Tekkenizer)
 
+from vllm.logger import init_logger
+
 if TYPE_CHECKING:
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 
+logger = init_logger(__name__)
+
 
 @dataclass
 class Encoding:
@@ -72,20 +76,21 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
             # Make sure special tokens will not raise
             tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
 
-            self._vocab = {
-                token: idx
-                for idx, token in enumerate(tokenizer_.vocab())
-            }
         elif isinstance(tokenizer_, SentencePieceTokenizer):
-            self._vocab = {
-                token: idx
-                for idx, token in enumerate(tokenizer_.vocab())
-            }
+            pass
         else:
             raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
 
+        self._vocab = tokenizer_.vocab()
+        # Convert to a Dict[str, int] to match protocol, but this is a lossy
+        # conversion. There may be multiple token ids that decode to the same
+        # string due to partial UTF-8 byte sequences being converted to �
+        self._vocab_dict = {
+            token: idx
+            for idx, token in enumerate(self._vocab)
+        }
         self.tokenizer = tokenizer_
-        self._max_token_id = max(self._vocab.values())
+        self._max_token_id = self.vocab_size - 1
 
     @classmethod
     def from_pretrained(cls,
@@ -182,7 +187,9 @@ def __call__(
         return Encoding(input_ids=input_ids)
 
     def get_vocab(self) -> Dict[str, int]:
-        return self._vocab
+        # NB: the dictionary form of the vocabulary collapses token ids that map
+        # to the same string but have different bytes
+        return self._vocab_dict
 
     def get_added_vocab(self) -> Dict[str, int]:
         # Mistral tokenizers have no added vocabulary
@@ -220,14 +227,20 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str:
             if any(isinstance(t, bytes) for t in tokens):
                 # we need to encode and decode all tokens again
                 shift = self.tokenizer.num_special_tokens
-                byte_tokens = [
-                    t.encode("utf-8") if not isinstance(t, bytes) else t
-                    for t in tokens
-                ]
-                ids = [
-                    self.tokenizer._tekken_token2id_nospecial[t] + shift
-                    for t in byte_tokens
-                ]
+
+                def _token_to_id(t: str):
+                    t_bytes = t.encode("utf-8") \
+                        if not isinstance(t, bytes) else t
+                    try:
+                        return shift + \
+                            self.tokenizer._tekken_token2id_nospecial[t_bytes]
+                    except KeyError:
+                        logger.warning(
+                            "Failed to convert token %s to id,"
+                            " replacing with <unk>", t_bytes)
+                        return self.tokenizer.unk_id
+
+                ids = [_token_to_id(t) for t in tokens]
                 decoded = self.tokenizer.decode(ids)
             else:
                 decoded = "".join(tokens)
@@ -236,7 +249,13 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str:
 
         return decoded
 
-    def decode(self, ids: Union[List[int], int]) -> str:
+    def decode(self,
+               ids: Union[List[int], int],
+               skip_special_tokens: bool = True) -> str:
+        assert (
+            skip_special_tokens
+        ), "skip_special_tokens=False is not supported for Mistral tokenizers."
+
         if isinstance(ids, int):
             ids = [ids]
         return self.tokenizer.decode(ids)
@@ -257,10 +276,11 @@ def convert_ids_to_tokens(
 
         tokens = [self.tokenizer.id_to_piece(id) for id in ids]
 
-        if any(t.strip() == "�" for t in tokens):
-            # if any stripped decoded token is undefined
-            # because it's invalid unicode then pass bytes
+        if any("�" in t for t in tokens):
+            # if a decoded token contains the replacement character, then the
+            # token has an incomplete UTF-8 character so we must use bytes
             # See: https://github.com/vllm-project/vllm/pull/8640
+            #      https://github.com/vllm-project/vllm/pull/9625
             tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
 
         return tokens

From 4581d2cc02f655e76233f9cb129f07c6b65d39f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Jonasson?= <andre.jonasson@gmail.com>
Date: Fri, 1 Nov 2024 19:41:38 +0100
Subject: [PATCH 15/85] [Core] Refactor: Clean up unused argument in
 Scheduler._preempt (#9696)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: André Jonasson <andre.jonasson@gmail.com>
---
 vllm/core/scheduler.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 88733b8f53b86..e35c05f4fe7f7 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -828,8 +828,7 @@ def _schedule_priority_preemption(
                                          num_running_seqs)
 
                 #Preempt out the victim sequence group
-                self._preempt(vseq_group, blocks_to_swap_out,
-                              PreemptionMode.RECOMPUTE)
+                self._preempt(vseq_group, blocks_to_swap_out)
                 waiting_queue.appendleft(vseq_group)
                 force_preemption_count += 1
             #Put the sequence back into the waiting queue
@@ -1451,12 +1450,8 @@ def _append_slots(self,
             if len(cows) > 0:
                 blocks_to_copy.extend(cows)
 
-    def _preempt(
-        self,
-        seq_group: SequenceGroup,
-        blocks_to_swap_out: List[Tuple[int, int]],
-        preemption_mode: Optional[PreemptionMode] = None,
-    ) -> PreemptionMode:
+    def _preempt(self, seq_group: SequenceGroup,
+                 blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode:
         # If preemption mode is not specified, we determine the mode as follows:
         # We use recomputation by default since it incurs lower overhead than
         # swapping. However, when the sequence group has multiple sequences

From aff1fd81881bf29f82ad6ba55b301828764cd120 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 1 Nov 2024 11:50:37 -0700
Subject: [PATCH 16/85] [torch.compile] use interpreter with stable api from
 pytorch (#9889)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py | 165 +++++++++++++++++++----------------
 1 file changed, 89 insertions(+), 76 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 10cf49e19eccc..96ddcba467c5b 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -243,6 +243,65 @@ def split_graph(graph: fx.GraphModule,
     return split_gm, outputs
 
 
+# we share the global graph pool among all the backends
+global_graph_pool = None
+
+
+class PiecewiseCompileInterpreter(torch.fx.Interpreter):
+    """Code adapted from `torch.fx.passes.shape_prop.ShapeProp`.
+    It runs the given graph with fake inputs, and compiles some
+    submodules specified by `compile_submod_names` with the given
+    compilation configs.
+    """
+
+    def __init__(self, module: torch.fx.GraphModule,
+                 compile_submod_names: List[str],
+                 compilation_configs: CompilationConfig, graph_pool):
+        super().__init__(module)
+        from torch._guards import detect_fake_mode
+        self.fake_mode = detect_fake_mode()
+        self.compile_submod_names = compile_submod_names
+        self.compilation_configs = compilation_configs
+        self.graph_pool = graph_pool
+        self.have_seen_first_graph = False
+
+    def run(self, *args):
+        fake_args = [
+            self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
+            for t in args
+        ]
+        return super().run(*fake_args)
+
+    def call_module(self, target: torch.fx.node.Target,
+                    args: Tuple[torch.fx.node.Argument,
+                                ...], kwargs: Dict[str, Any]) -> Any:
+        assert isinstance(target, str)
+        output = super().call_module(target, args, kwargs)
+
+        if target in self.compile_submod_names:
+            submod = self.fetch_attr(target)
+            sym_shape_indices = [
+                i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
+            ]
+            compiled_graph_for_general_shape = wrap_inductor(
+                submod,
+                args,
+                self.compilation_configs.inductor_compile_config,
+                runtime_shape=None,
+                do_logging=not self.have_seen_first_graph,
+                use_inductor=self.compilation_configs.use_inductor)
+
+            self.module.__dict__[target] = PiecewiseBackend(
+                submod, self.compilation_configs, self.graph_pool,
+                not self.have_seen_first_graph, sym_shape_indices,
+                compiled_graph_for_general_shape)
+
+            self.have_seen_first_graph = True
+            compilation_counter.num_piecewise_capturable_graphs_seen += 1
+
+        return output
+
+
 class VllmBackend:
     """The compilation backend for `torch.compile` with VLLM.
     It is used for compilation level of `CompilationLevel.PIECEWISE`,
@@ -263,8 +322,14 @@ class VllmBackend:
     returned_callable: Callable
 
     def __init__(self, ):
-        # every instance of VllmBackend has its own graph pool
-        self.graph_pool = torch.cuda.graph_pool_handle()
+        global global_graph_pool
+        if global_graph_pool is None:
+            global_graph_pool = torch.cuda.graph_pool_handle()
+
+        # TODO: in the future, if we want to use multiple
+        # streams, it might not be safe to share a global pool.
+        # only investigate this when we use multiple streams
+        self.graph_pool = global_graph_pool
 
         # `torch.compile` is JIT compiled, so we don't need to
         # do anything here
@@ -286,55 +351,26 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
         self.split_gm, self.piecewise_graphs = split_graph(
             graph, self.compilation_configs.non_cudagraph_ops)
 
-        returned_callable: Callable  # type: ignore
+        from torch._dynamo.utils import lazy_format_graph_code
+        logger.debug("%s",
+                     lazy_format_graph_code("stiching module", self.split_gm))
 
-        if len(self.piecewise_graphs) == 0:
-            compilation_counter.num_piecewise_graphs_seen += 1
-            compilation_counter.num_piecewise_capturable_graphs_seen += 1
-            returned_callable = PiecewiseBackend(graph,
-                                                 self.compilation_configs,
-                                                 self.graph_pool,
-                                                 is_first_graph=True)
-        else:
-            from torch._dynamo.utils import lazy_format_graph_code
-            logger.debug(
-                "%s", lazy_format_graph_code("stiching module", self.split_gm))
-
-            is_first_graph = True
-
-            for item in self.piecewise_graphs:
-                compilation_counter.num_piecewise_graphs_seen += 1
-                compilation_counter.num_piecewise_capturable_graphs_seen += not item.is_splitting_graph  # noqa
-                if not item.is_splitting_graph:
-                    # cannot setattr to a module, so we need to set
-                    # the attribute in the __dict__
-                    self.split_gm.__dict__[
-                        item.submod_name] = PiecewiseBackend(
-                            item.graph, self.compilation_configs,
-                            self.graph_pool, is_first_graph)
-                    is_first_graph = False
-            returned_callable = self.split_gm
-
-        self.returned_callable = returned_callable
-        # trigger the first compilation
-        # code borrowed from https://github.com/pytorch/pytorch/blob/4e3e08b71171fa34172b2362ff668553fac75f27/torch/_dynamo/backends/distributed.py#L206 # noqa
-        # to turn the inputs into fake tensors
-        import torch._guards
-        from torch._guards import detect_fake_mode
-        fake_mode = detect_fake_mode(example_inputs)
-        fake_args = []
-        for arg in example_inputs:
-            if isinstance(arg, torch.Tensor) and not isinstance(
-                    arg, torch._subclasses.FakeTensor):
-                fake_args.append(
-                    torch._dynamo.utils.to_fake_tensor(arg, fake_mode))
-            else:
-                fake_args.append(arg)
-        self.returned_callable(*fake_args)
+        compilation_counter.num_piecewise_graphs_seen += len(
+            self.piecewise_graphs)
+        submod_names_to_compile = [
+            item.submod_name for item in self.piecewise_graphs
+            if not item.is_splitting_graph
+        ]
+
+        # propagate the split graph to the piecewise backend,
+        # compile submodules with symbolic shapes
+        PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile,
+                                    self.compilation_configs,
+                                    self.graph_pool).run(*example_inputs)
 
         self._called = True
 
-        return self.returned_callable
+        return self.split_gm
 
 
 @dataclasses.dataclass
@@ -352,11 +388,10 @@ class ConcreteSizeEntry:
 
 class PiecewiseBackend:
 
-    def __init__(self,
-                 graph: fx.GraphModule,
-                 compilation_configs: CompilationConfig,
-                 graph_pool: Any,
-                 is_first_graph: bool = False):
+    def __init__(self, graph: fx.GraphModule,
+                 compilation_configs: CompilationConfig, graph_pool: Any,
+                 is_first_graph: bool, sym_shape_indices: List[int],
+                 compiled_graph_for_general_shape: Callable):
         """
         The backend for piecewise compilation.
         It mainly handles the compilation and cudagraph capturing.
@@ -381,12 +416,11 @@ def __init__(self,
             self.compilation_configs.capture_sizes
         ) if self.compilation_configs.use_cudagraph else set()
 
-        self.compile_finished = False
         self.first_run_finished = False
 
-        self.compiled_graph_for_general_shape: Callable = None  # type: ignore
+        self.compiled_graph_for_general_shape = compiled_graph_for_general_shape  # noqa
 
-        self.sym_shape_indices: List[int] = []
+        self.sym_shape_indices = sym_shape_indices
 
         # the entries for different shapes that we need to either
         # compile or capture cudagraph
@@ -399,27 +433,6 @@ def __init__(self,
             )
 
     def __call__(self, *args) -> Any:
-
-        if not self.compile_finished:
-            self.compile_finished = True
-
-            # this is the first compilation, we will compile a graph with
-            # dynamic shape, as the caller will mark first dimension as dynamic
-
-            self.sym_shape_indices = [
-                i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
-            ]
-
-            self.compiled_graph_for_general_shape = wrap_inductor(
-                self.graph,
-                args,
-                self.compilation_configs.inductor_compile_config,
-                runtime_shape=None,
-                do_logging=self.is_first_graph,
-                use_inductor=self.compilation_configs.use_inductor)
-
-            return self.graph(*args)
-
         if not self.first_run_finished:
             self.first_run_finished = True
             return self.compiled_graph_for_general_shape(*args)

From 598b6d7b070149aae5884aa8b17a0c91c93172f5 Mon Sep 17 00:00:00 2001
From: Pavani Majety <pmajety@nvidia.com>
Date: Fri, 1 Nov 2024 12:15:05 -0700
Subject: [PATCH 17/85] [Bugfix/Core] Flashinfer k_scale and v_scale (#9861)

---
 tests/kernels/test_cache.py                   | 21 ++++++++++++-------
 vllm/attention/backends/flashinfer.py         |  9 +++++---
 .../layers/quantization/modelopt.py           |  7 +++++--
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index 5b8311a33c361..e2b4778b94b9e 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -258,19 +258,20 @@ def test_reshape_and_cache_flash(
     del key_caches
     del value_caches
 
+    k_scale = key.amax().item() / 256
+    v_scale = value.amax().item() / 256
+
     # Clone the KV caches.
     if kv_cache_dtype == "fp8":
         cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
-        ops.convert_fp8(cloned_key_cache, key_cache)
+        ops.convert_fp8(cloned_key_cache, key_cache, k_scale, kv_cache_dtype)
         cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
-        ops.convert_fp8(cloned_value_cache, value_cache)
+        ops.convert_fp8(cloned_value_cache, value_cache, v_scale,
+                        kv_cache_dtype)
     else:
         cloned_key_cache = key_cache.clone()
         cloned_value_cache = value_cache.clone()
 
-    # Using default kv_scale
-    k_scale = v_scale = 1.0
-
     # Call the reshape_and_cache kernel.
     opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
             (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
@@ -281,9 +282,15 @@ def test_reshape_and_cache_flash(
 
     if kv_cache_dtype == "fp8":
         result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
-        ops.convert_fp8(result_key_cache, key_cache)
+        ops.convert_fp8(result_key_cache,
+                        key_cache,
+                        k_scale,
+                        kv_dtype=kv_cache_dtype)
         result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
-        ops.convert_fp8(result_value_cache, value_cache)
+        ops.convert_fp8(result_value_cache,
+                        value_cache,
+                        v_scale,
+                        kv_dtype=kv_cache_dtype)
 
     # Run the reference implementation.
     block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 234c87d5c4edb..658805d35be0a 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -759,8 +759,6 @@ def forward(
         v_scale: float = 1.0,
         attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
-        assert k_scale == 1.0 and v_scale == 1.0, (
-            "key/v_scale is not supported in FlashInfer.")
         if attn_type != AttentionType.DECODER:
             raise NotImplementedError("Encoder self-attention and "
                                       "encoder/decoder cross-attention "
@@ -874,7 +872,12 @@ def unified_flash_infer(
             assert prefill_meta is not None
             assert prefill_meta.prefill_wrapper is not None
             prefill_output = prefill_meta.prefill_wrapper.forward(
-                query, kv_cache, logits_soft_cap=logits_soft_cap, causal=True)
+                query,
+                kv_cache,
+                logits_soft_cap=logits_soft_cap,
+                causal=True,
+                k_scale=k_scale,
+                v_scale=v_scale)
     if decode_meta := attn_metadata.decode_metadata:
         assert attn_metadata.decode_metadata is not None
         assert attn_metadata.decode_metadata.decode_wrapper is not None
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index dc5f47eb9b0fb..9694f2b8208e2 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -141,8 +141,11 @@ def create_weights(
             layer.register_parameter("input_scale", scale)
 
     def process_weights_after_loading(self, layer: Module) -> None:
-        max_w_scale, weight = requantize_with_max_scale(
-            layer.weight, layer.weight_scale, layer.logical_widths)
+        weight = layer.weight
+        max_w_scale = layer.weight_scale.max()
+        if not (layer.weight_scale == layer.weight_scale[0]).all():
+            max_w_scale, weight = requantize_with_max_scale(
+                layer.weight, layer.weight_scale, layer.logical_widths)
         layer.weight = Parameter(weight.t(), requires_grad=False)
         layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
         layer.input_scale = Parameter(layer.input_scale.max(),

From 18bd7587b78b3b9868fea29d59ae8c3600c3e5a5 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 1 Nov 2024 13:51:57 -0700
Subject: [PATCH 18/85] [1/N] pass the complete config from engine to executor
 (#9933)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/engine/async_llm_engine.py       |  2 +-
 vllm/engine/llm_engine.py             | 50 +++++++++------------
 vllm/engine/multiprocessing/engine.py |  7 +--
 vllm/executor/executor_base.py        | 37 ++++++----------
 vllm/executor/xpu_executor.py         | 44 ++++---------------
 vllm/v1/engine/llm_engine.py          | 62 +++++++++------------------
 6 files changed, 65 insertions(+), 137 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 5198467a6ac40..6aeaf484a22b4 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -680,7 +680,7 @@ def from_engine_args(
 
         # Create the async LLM engine.
         engine = cls(
-            **engine_config.to_dict(),
+            vllm_config=engine_config,
             executor_class=executor_class,
             log_requests=not engine_args.disable_log_requests,
             log_stats=not engine_args.disable_log_stats,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index edef1f30a9e91..e6fe1effb8287 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -13,11 +13,8 @@
 from typing_extensions import TypeIs, TypeVar
 
 import vllm.envs as envs
-from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
-                         EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
-                         ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig)
+from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
+                         ObservabilityConfig, ParallelConfig, SchedulerConfig)
 from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
                                  SchedulerOutputs)
 from vllm.engine.arg_utils import EngineArgs
@@ -222,17 +219,7 @@ def validate_outputs(
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        cache_config: CacheConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
-        speculative_config: Optional[SpeculativeConfig],
-        decoding_config: Optional[DecodingConfig],
-        observability_config: Optional[ObservabilityConfig],
-        prompt_adapter_config: Optional[PromptAdapterConfig],
+        vllm_config: EngineConfig,
         executor_class: Type[ExecutorBase],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -240,6 +227,22 @@ def __init__(
         input_registry: InputRegistry = INPUT_REGISTRY,
         use_cached_outputs: bool = False,
     ) -> None:
+
+        # TODO: remove the local variables and use self.* throughout the class.
+        model_config = self.model_config = vllm_config.model_config
+        cache_config = self.cache_config = vllm_config.cache_config
+        lora_config = self.lora_config = vllm_config.lora_config
+        parallel_config = self.parallel_config = vllm_config.parallel_config
+        scheduler_config = self.scheduler_config = vllm_config.scheduler_config
+        device_config = self.device_config = vllm_config.device_config
+        speculative_config = self.speculative_config = vllm_config.speculative_config  # noqa
+        load_config = self.load_config = vllm_config.load_config
+        decoding_config = self.decoding_config = vllm_config.decoding_config or DecodingConfig(  # noqa
+        )
+        prompt_adapter_config = self.prompt_adapter_config = vllm_config.prompt_adapter_config  # noqa
+        observability_config = self.observability_config = vllm_config.observability_config or ObservabilityConfig(  # noqa
+        )
+
         logger.info(
             "Initializing an LLM engine (v%s) with config: "
             "model=%r, speculative_config=%r, tokenizer=%r, "
@@ -340,18 +343,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
         self.input_processor = input_registry.create_input_processor(
             model_config)
 
-        self.model_executor = executor_class(
-            model_config=model_config,
-            cache_config=cache_config,
-            parallel_config=parallel_config,
-            scheduler_config=scheduler_config,
-            device_config=device_config,
-            lora_config=lora_config,
-            speculative_config=speculative_config,
-            load_config=load_config,
-            prompt_adapter_config=prompt_adapter_config,
-            observability_config=self.observability_config,
-        )
+        self.model_executor = executor_class(vllm_config=vllm_config, )
 
         if self.model_config.task != "embedding":
             self._initialize_kv_caches()
@@ -582,7 +574,7 @@ def from_engine_args(
         executor_class = cls._get_executor_cls(engine_config)
         # Create the LLM engine.
         engine = cls(
-            **engine_config.to_dict(),
+            vllm_config=engine_config,
             executor_class=executor_class,
             log_stats=not engine_args.disable_log_stats,
             usage_context=usage_context,
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 0a7f430eca488..eb1512ca17822 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -7,8 +7,6 @@
 import zmq
 
 from vllm import AsyncEngineArgs, SamplingParams
-from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig)
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT,
@@ -30,9 +28,6 @@
 else:
     from vllm.engine.llm_engine import LLMEngine
 
-CONFIG_TYPE = Union[ModelConfig, DecodingConfig, ParallelConfig,
-                    SchedulerConfig, LoRAConfig]
-
 logger = init_logger(__name__)
 
 POLLING_TIMEOUT_MS = 10000
@@ -130,7 +125,7 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs,
 
         return cls(ipc_path=ipc_path,
                    use_async_sockets=use_async_sockets,
-                   **engine_config.to_dict(),
+                   vllm_config=engine_config,
                    executor_class=executor_class,
                    log_requests=not engine_args.disable_log_requests,
                    log_stats=not engine_args.disable_log_stats,
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index c96cb0f2c2981..2248eecd1849f 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,10 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import List, Optional, Set, Tuple
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig)
+from vllm.config import EngineConfig
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -23,27 +20,19 @@ class ExecutorBase(ABC):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        cache_config: CacheConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
-        speculative_config: Optional[SpeculativeConfig],
-        prompt_adapter_config: Optional[PromptAdapterConfig],
-        observability_config: Optional[ObservabilityConfig],
+        vllm_config: EngineConfig,
     ) -> None:
-        self.model_config = model_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.load_config = load_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.speculative_config = speculative_config
-        self.prompt_adapter_config = prompt_adapter_config
-        self.observability_config = observability_config
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.lora_config = vllm_config.lora_config
+        self.load_config = vllm_config.load_config
+        self.parallel_config = vllm_config.parallel_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.device_config = vllm_config.device_config
+        self.speculative_config = vllm_config.speculative_config
+        self.prompt_adapter_config = vllm_config.prompt_adapter_config
+        self.observability_config = vllm_config.observability_config
         self._init_executor()
 
     @abstractmethod
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py
index 5f78993ddc4b4..36b7e2265efab 100644
--- a/vllm/executor/xpu_executor.py
+++ b/vllm/executor/xpu_executor.py
@@ -2,10 +2,7 @@
 
 import torch
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig)
+from vllm.config import ModelConfig, ParallelConfig
 from vllm.executor.executor_base import ExecutorAsyncBase
 from vllm.executor.gpu_executor import GPUExecutor
 from vllm.logger import init_logger
@@ -21,38 +18,13 @@ class XPUExecutor(GPUExecutor):
 
     uses_ray: bool = False
 
-    def __init__(
-        self,
-        model_config: ModelConfig,
-        cache_config: CacheConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
-        prompt_adapter_config: Optional[PromptAdapterConfig],
-        speculative_config: Optional[SpeculativeConfig],
-        observability_config: Optional[ObservabilityConfig],
-    ) -> None:
-        assert device_config.device_type == "xpu"
-        assert (not speculative_config
-                ), "Speculative decoding not yet supported for XPU backend"
-
-        model_config = _verify_and_get_model_config(model_config)
-
-        self.model_config = model_config
-        self.cache_config = cache_config
-        self.load_config = load_config
-        self.lora_config = lora_config
-        self.parallel_config = _verify_and_get_parallel_config(parallel_config)
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.prompt_adapter_config = prompt_adapter_config
-        self.speculative_config = None
-        self.observability_config = observability_config
-
-        # Instantiate the worker and load the model to GPU.
-        self._init_executor()
+    def _init_executor(self) -> None:
+        assert self.device_config.device_type == "xpu"
+        assert self.speculative_config is None, (
+            "Speculative decoding not yet supported for XPU backend")
+
+        self.model_config = _verify_and_get_model_config(self.model_config)
+        GPUExecutor._init_executor(self)
 
     def _get_worker_module_and_class(
             self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]:
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 072e52bcd686a..febabd2f31036 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -2,11 +2,8 @@
 from typing import (Any, Dict, Iterable, List, Mapping, Optional, Tuple, Type,
                     Union)
 
-from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
-                         EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
-                         ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig)
+from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
+                         ObservabilityConfig, ParallelConfig, SchedulerConfig)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics_types import StatLoggerBase
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs,
@@ -35,17 +32,7 @@ class LLMEngine:
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        cache_config: CacheConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
-        speculative_config: Optional[SpeculativeConfig],
-        decoding_config: Optional[DecodingConfig],
-        observability_config: Optional[ObservabilityConfig],
-        prompt_adapter_config: Optional[PromptAdapterConfig],
+        vllm_config: EngineConfig,
         executor_class: Type[GPUExecutor],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -53,6 +40,22 @@ def __init__(
         input_registry: InputRegistry = INPUT_REGISTRY,
         use_cached_outputs: bool = False,
     ) -> None:
+
+        # TODO: remove the local variables and use self.* throughout the class.
+        model_config = self.model_config = vllm_config.model_config
+        cache_config = self.cache_config = vllm_config.cache_config
+        lora_config = self.lora_config = vllm_config.lora_config
+        parallel_config = self.parallel_config = vllm_config.parallel_config
+        scheduler_config = self.scheduler_config = vllm_config.scheduler_config
+        device_config = self.device_config = vllm_config.device_config
+        speculative_config = self.speculative_config = vllm_config.speculative_config  # noqa
+        load_config = self.load_config = vllm_config.load_config
+        decoding_config = self.decoding_config = vllm_config.decoding_config or DecodingConfig(  # noqa
+        )
+        prompt_adapter_config = self.prompt_adapter_config = vllm_config.prompt_adapter_config  # noqa
+        observability_config = self.observability_config = vllm_config.observability_config or ObservabilityConfig(  # noqa
+        )
+
         # Override the configs for V1.
         # FIXME
         if usage_context == UsageContext.LLM_CLASS:
@@ -112,18 +115,6 @@ def __init__(
             model_config.mm_processor_kwargs,
         )
 
-        self.model_config = model_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.speculative_config = speculative_config
-        self.load_config = load_config
-        self.decoding_config = decoding_config or DecodingConfig()
-        self.prompt_adapter_config = prompt_adapter_config
-        self.observability_config = observability_config or ObservabilityConfig(
-        )
         self.log_stats = log_stats
 
         assert not self.model_config.skip_tokenizer_init
@@ -154,18 +145,7 @@ def __init__(
         # Request id -> RequestOutput
         self.request_outputs: Dict[str, RequestOutput] = {}
 
-        self.model_executor = executor_class(
-            model_config=model_config,
-            cache_config=cache_config,
-            parallel_config=parallel_config,
-            scheduler_config=scheduler_config,
-            device_config=device_config,
-            lora_config=lora_config,
-            speculative_config=speculative_config,
-            load_config=load_config,
-            prompt_adapter_config=prompt_adapter_config,
-            observability_config=self.observability_config,
-        )
+        self.model_executor = executor_class(vllm_config=vllm_config)
         assert self.model_config.task != "embedding"
         self._initialize_kv_caches()
 
@@ -203,7 +183,7 @@ def from_engine_args(
         executor_class = cls._get_executor_cls(engine_config)
         # Create the LLM engine.
         engine = cls(
-            **engine_config.to_dict(),
+            vllm_config=engine_config,
             executor_class=executor_class,
             log_stats=not engine_args.disable_log_stats,
             usage_context=usage_context,

From 27cd36e6e2e808464c8343066b03db5db2d15413 Mon Sep 17 00:00:00 2001
From: Gene Der Su <gdsu@ucdavis.edu>
Date: Fri, 1 Nov 2024 15:08:23 -0700
Subject: [PATCH 19/85] [Bugfix] PicklingError on RayTaskError (#9934)

Signed-off-by: Gene Su <e870252314@gmail.com>
---
 vllm/engine/multiprocessing/engine.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index eb1512ca17822..a73b4c825b11c 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -5,6 +5,7 @@
 
 import cloudpickle
 import zmq
+from ray.exceptions import RayTaskError
 
 from vllm import AsyncEngineArgs, SamplingParams
 # yapf conflicts with isort for this block
@@ -305,6 +306,11 @@ def _health_check(self):
     def _send_outputs(self, outputs: REQUEST_OUTPUTS_T):
         """Send List of RequestOutput to RPCClient."""
         if outputs:
+            # RayTaskError might not be picklable here. We need to unpack the
+            # underlying exception as the real exception in the output.
+            if (isinstance(outputs, RPCError)
+                    and isinstance(outputs.exception, RayTaskError)):
+                outputs.exception = outputs.exception.cause
             output_bytes = pickle.dumps(outputs)
             self.output_socket.send_multipart((output_bytes, ), copy=False)
 

From d151fde8341d34592e1e5e14d2152d067421cf63 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 1 Nov 2024 23:04:42 +0000
Subject: [PATCH 20/85] [ci/build] Bump the patch-update group with 10 updates
 (#9897)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Kevin H. Luu <kevin@anyscale.com>
---
 requirements-lint.txt |  2 +-
 requirements-test.in  |  2 +-
 requirements-test.txt | 12 ++++++------
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/requirements-lint.txt b/requirements-lint.txt
index 07f738873e1a8..f9132bbf96437 100644
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
@@ -1,7 +1,7 @@
 # formatting
 yapf==0.32.0
 toml==0.10.2
-tomli==2.0.1
+tomli==2.0.2
 ruff==0.6.5
 codespell==2.3.0
 isort==5.13.2
diff --git a/requirements-test.in b/requirements-test.in
index 3881f2566b556..5d44664c082a6 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -32,6 +32,6 @@ aiohttp
 
 # quantization
 bitsandbytes>=0.44.0
-buildkite-test-collector==0.1.8
+buildkite-test-collector==0.1.9
 
 numpy < 2.0.0
diff --git a/requirements-test.txt b/requirements-test.txt
index c474c2ec34b22..7477b7c3a79cd 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -36,20 +36,20 @@ attrs==24.2.0
     #   referencing
 audioread==3.0.1
     # via librosa
-awscli==1.35.16
+awscli==1.35.19
     # via -r requirements-test.in
 bitsandbytes==0.44.1
     # via -r requirements-test.in
 black==24.10.0
     # via datamodel-code-generator
-boto3==1.35.50
+boto3==1.35.53
     # via tensorizer
-botocore==1.35.50
+botocore==1.35.53
     # via
     #   awscli
     #   boto3
     #   s3transfer
-buildkite-test-collector==0.1.8
+buildkite-test-collector==0.1.9
     # via -r requirements-test.in
 certifi==2024.8.30
     # via
@@ -426,7 +426,7 @@ requests==2.32.3
     #   transformers
 rouge-score==0.1.2
     # via lm-eval
-rpds-py==0.20.0
+rpds-py==0.20.1
     # via
     #   jsonschema
     #   referencing
@@ -552,7 +552,7 @@ xxhash==3.5.0
     # via
     #   datasets
     #   evaluate
-yarl==1.17.0
+yarl==1.17.1
     # via aiohttp
 zstandard==0.23.0
     # via lm-eval

From 6c0b7f548d80b5f61bfa472ad1497597c922dbc2 Mon Sep 17 00:00:00 2001
From: Peter Salas <peter@fixie.ai>
Date: Fri, 1 Nov 2024 16:21:10 -0700
Subject: [PATCH 21/85] [Core][VLM] Add precise multi-modal placeholder
 tracking (#8346)

Signed-off-by: Peter Salas <peter@fixie.ai>
---
 examples/offline_inference_audio_language.py  |   6 +-
 tests/kernels/utils.py                        |   2 +
 .../audio_language/test_ultravox.py           |  91 ++++++--
 tests/multimodal/test_processor_kwargs.py     |  14 +-
 tests/multimodal/test_utils.py                |  57 ++++-
 tests/worker/test_model_input.py              |   3 +
 vllm/attention/backends/abstract.py           |  11 +
 vllm/attention/backends/blocksparse_attn.py   |   3 +
 vllm/attention/backends/flash_attn.py         |  20 ++
 vllm/attention/backends/flashinfer.py         |  18 ++
 vllm/attention/backends/placeholder_attn.py   |  22 +-
 vllm/attention/backends/rocm_flash_attn.py    |   3 +
 vllm/attention/backends/utils.py              |  18 ++
 vllm/attention/backends/xformers.py           |   3 +
 vllm/core/scheduler.py                        |   2 +
 vllm/inputs/__init__.py                       |   3 +-
 vllm/inputs/data.py                           |  11 +-
 vllm/inputs/registry.py                       |  40 ++--
 vllm/model_executor/models/blip.py            |  10 +-
 vllm/model_executor/models/blip2.py           |  15 +-
 vllm/model_executor/models/chameleon.py       |  22 +-
 vllm/model_executor/models/clip.py            |  32 ++-
 vllm/model_executor/models/fuyu.py            |  31 ++-
 vllm/model_executor/models/internvl.py        |   8 +-
 vllm/model_executor/models/llava.py           |  15 +-
 vllm/model_executor/models/llava_next.py      |  11 +-
 .../model_executor/models/llava_next_video.py |  25 +-
 vllm/model_executor/models/llava_onevision.py |  21 +-
 vllm/model_executor/models/minicpmv.py        |   6 +-
 vllm/model_executor/models/mllama.py          |   7 +-
 vllm/model_executor/models/paligemma.py       |   8 +-
 vllm/model_executor/models/phi3v.py           |   8 +-
 vllm/model_executor/models/pixtral.py         |  34 ++-
 vllm/model_executor/models/qwen.py            |  10 +-
 vllm/model_executor/models/qwen2_audio.py     |  15 +-
 vllm/model_executor/models/qwen2_vl.py        |  11 +-
 vllm/model_executor/models/siglip.py          |  24 +-
 vllm/model_executor/models/ultravox.py        |  60 ++---
 vllm/model_executor/models/utils.py           |  18 +-
 vllm/multimodal/__init__.py                   |   7 +-
 vllm/multimodal/base.py                       | 214 +++++++++++++++++-
 vllm/multimodal/image.py                      |   8 +-
 vllm/multimodal/registry.py                   |  18 +-
 vllm/multimodal/utils.py                      |  21 +-
 vllm/multimodal/video.py                      |  14 +-
 vllm/sequence.py                              |  17 +-
 vllm/worker/cpu_model_runner.py               |  38 +++-
 vllm/worker/enc_dec_model_runner.py           |  30 +--
 vllm/worker/model_runner.py                   |  21 +-
 vllm/worker/model_runner_base.py              |   5 +-
 vllm/worker/openvino_model_runner.py          |  43 +++-
 vllm/worker/tpu_model_runner.py               |   4 +
 vllm/worker/xpu_model_runner.py               |  38 +++-
 53 files changed, 914 insertions(+), 282 deletions(-)

diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py
index 37ec667d96a77..050b791b62adb 100644
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
@@ -34,11 +34,7 @@ def run_ultravox(question: str, audio_count: int):
                                            tokenize=False,
                                            add_generation_prompt=True)
 
-    llm = LLM(model=model_name,
-              enforce_eager=True,
-              enable_chunked_prefill=False,
-              max_model_len=8192,
-              limit_mm_per_prompt={"audio": audio_count})
+    llm = LLM(model=model_name, limit_mm_per_prompt={"audio": audio_count})
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index a2d414f636e13..c3d5252edc2a3 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -869,6 +869,7 @@ def make_test_metadata(
         return attn_backend.make_metadata(
             num_prefills=num_prefills,
             slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
+            multi_modal_placeholder_index_maps=None,
             num_prefill_tokens=num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
@@ -914,6 +915,7 @@ def make_test_metadata(
         return attn_backend.make_metadata(
             num_prefills=num_prefills,
             slot_mapping=kv_mmap.slot_mapping,
+            multi_modal_placeholder_index_maps=None,
             num_prefill_tokens=num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index b9089e75ffab8..d14e88b4e5b26 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -2,8 +2,10 @@
 
 import numpy as np
 import pytest
+import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer, BatchEncoding
 
+from tests.utils import RemoteOpenAIServer
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
@@ -17,6 +19,13 @@
 VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
 HF_PLACEHOLDER = "<|audio|>"
 
+CHUNKED_PREFILL_KWARGS = {
+    "enable_chunked_prefill": True,
+    "max_num_seqs": 2,
+    # Use a very small limit to exercise chunked prefill.
+    "max_num_batched_tokens": 16
+}
+
 
 @pytest.fixture(scope="session")
 def audio_assets():
@@ -30,6 +39,26 @@ def audio(request):
     return AudioAsset(request.param)
 
 
+@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
+def server(request, audio_assets):
+    args = [
+        "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
+        f"--limit-mm-per-prompt=audio={len(audio_assets)}"
+    ] + [
+        f"--{key.replace('_','-')}={value}"
+        for key, value in request.param.items()
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
 def _get_prompt(audio_count, question, placeholder):
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     placeholder = f"{placeholder}\n" * audio_count
@@ -68,8 +97,7 @@ def run_test(
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    **kwargs,
 ):
     """Inference result should be the same between hf and vllm."""
     torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
@@ -79,11 +107,8 @@ def run_test(
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
 
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
+    with vllm_runner(model, dtype=dtype, enforce_eager=True,
+                     **kwargs) as vllm_model:
         vllm_outputs_per_audio = [
             vllm_model.generate_greedy_logprobs([vllm_prompt],
                                                 max_tokens,
@@ -135,18 +160,16 @@ def run_multi_audio_test(
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    **kwargs,
 ):
     with vllm_runner(model,
                      dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
                      limit_mm_per_prompt={
                          "audio":
                          max((len(audio) for _, audio in prompts_and_audios))
-                     }) as vllm_model:
+                     },
+                     **kwargs) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
             [prompt for prompt, _ in prompts_and_audios],
             max_tokens,
@@ -162,8 +185,9 @@ def run_multi_audio_test(
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
 def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
+                num_logprobs: int, vllm_kwargs: dict) -> None:
 
     vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER)
     hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
@@ -175,7 +199,7 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
         dtype=dtype,
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
-        tensor_parallel_size=1,
+        **vllm_kwargs,
     )
 
 
@@ -183,9 +207,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
 def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
-                                     max_tokens: int,
-                                     num_logprobs: int) -> None:
+                                     max_tokens: int, num_logprobs: int,
+                                     vllm_kwargs: dict) -> None:
 
     vllm_prompt = _get_prompt(len(audio_assets),
                               "Describe each of the audios above.",
@@ -198,5 +223,37 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
         dtype=dtype,
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
-        tensor_parallel_size=1,
+        **vllm_kwargs,
     )
+
+
+@pytest.mark.asyncio
+async def test_online_inference(client, audio_assets):
+    """Exercises online inference with/without chunked prefill enabled."""
+
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *[{
+                "type": "audio_url",
+                "audio_url": {
+                    "url": audio.url
+                }
+            } for audio in audio_assets],
+            {
+                "type":
+                "text",
+                "text":
+                f"What's happening in these {len(audio_assets)} audio clips?"
+            },
+        ],
+    }]
+
+    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
+                                                           messages=messages,
+                                                           max_tokens=10)
+
+    assert len(chat_completion.choices) == 1
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py
index 5044740c3e734..4d3bbd805c152 100644
--- a/tests/multimodal/test_processor_kwargs.py
+++ b/tests/multimodal/test_processor_kwargs.py
@@ -5,8 +5,8 @@
 import pytest
 import torch
 
-from vllm.inputs import DecoderOnlyInputs, InputContext, token_inputs
-from vllm.inputs.registry import InputRegistry
+from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext,
+                         InputRegistry, token_inputs)
 from vllm.multimodal import MultiModalRegistry
 from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
 
@@ -56,7 +56,7 @@ def custom_dummy_data_factory(self,
                                   num_crops=DEFAULT_NUM_CROPS):
         seq_data = SequenceData(
             array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * num_crops))
-        return seq_data, None
+        return DummyData(seq_data, None)
 
     with patch(
             "vllm.inputs.registry.InputRegistry._default_dummy_data_factory",
@@ -177,9 +177,9 @@ def test_dummy_data_kwarg_overrides(use_dummy_data_mock, num_crops):
     # NOTE: seq_len is thrown away here since this will leverage the
     # default dummy data factory that we have patched in, whose seq
     # len is solely dependent on the value of the mm_processor_kwargs.
-    seq_data, _ = dummy_registry.dummy_data_for_profiling(
+    dummy_data = dummy_registry.dummy_data_for_profiling(
         ctx.model_config, seq_len=-1, mm_registry=mm_registry)
-    assert len(seq_data.prompt_token_ids) == expected_seq_count
+    assert len(dummy_data.seq_data.prompt_token_ids) == expected_seq_count
 
 
 @pytest.mark.parametrize(
@@ -206,9 +206,9 @@ def test_dummy_data_with_sad_kwarg_overrides(use_dummy_data_mock,
     # NOTE: seq_len is thrown away here since this will leverage the
     # default dummy data factory that we have patched in, whose seq
     # len is solely dependent on the value of the mm_processor_kwargs.
-    seq_data, _ = dummy_registry.dummy_data_for_profiling(
+    dummy_data = dummy_registry.dummy_data_for_profiling(
         ctx.model_config, seq_len=-1, mm_registry=mm_registry)
-    assert len(seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS
+    assert len(dummy_data.seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS
 
 
 ### Test overrides for the max token count per multimodal instance
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 38cd48629f903..69f04f0a69c0b 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -92,18 +92,50 @@ def test_repeat_and_pad_placeholder_tokens(model):
     tokenizer = AutoTokenizer.from_pretrained(model)
 
     test_cases = [
-        ("<image>", 2, "<image><image>", [32000, 32000]),
-        ("<image><image>", 2, "<image><image><image>", [32000, 32000, 32000]),
-        ("<image><image>", [3, 2], "<image><image><image><image><image>",
-         [32000, 32000, 32000, 32000, 32000]),
-        ("Image:<image>Image:<image>!", [3, 2],
-         "Image:<image><image><image>Image:<image><image>!",
-         [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918]),
-        ("<image>", [3, 2], "<image><image><image>", [32000, 32000, 32000]),
-    ]
-
-    for prompt, repeat_count, expected_prompt, expected_token_ids in test_cases:
-        new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+        (
+            "<image>",
+            2,
+            "<image><image>",
+            [32000, 32000],
+            [{ "offset": 0, "length": 2 }],
+        ),
+        (
+            "<image><image>",
+            2,
+            "<image><image><image>",
+            [32000, 32000, 32000],
+            [{ "offset": 0, "length": 2 }]),
+        (
+            "<image><image>",
+            [3, 2],
+            "<image><image><image><image><image>",
+            [32000, 32000, 32000, 32000, 32000],
+            [{ "offset": 0, "length": 3 }, { "offset": 3, "length": 2 }],
+        ),
+        (
+            "Image:<image>Image:<image>!",
+            [3, 2],
+            "Image:<image><image><image>Image:<image><image>!",
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+            [{ "offset": 2, "length": 3 }, { "offset": 7, "length": 2 }],
+        ),
+        (
+            "<image>",
+            [3, 2],
+            "<image><image><image>",
+            [32000, 32000, 32000],
+            [{ "offset": 0, "length": 3 }],
+        ),
+    ]  # yapf: disable
+
+    for (
+            prompt,
+            repeat_count,
+            expected_prompt,
+            expected_token_ids,
+            expected_ranges,
+    ) in test_cases:
+        new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
             tokenizer=tokenizer,
             prompt=prompt,
             prompt_token_ids=tokenizer.encode(prompt,
@@ -113,3 +145,4 @@ def test_repeat_and_pad_placeholder_tokens(model):
         )
         assert new_prompt == expected_prompt
         assert new_token_ids == expected_token_ids
+        assert ranges == expected_ranges
diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py
index 1e7f560fc68cc..b36e8bfe73ff3 100644
--- a/tests/worker/test_model_input.py
+++ b/tests/worker/test_model_input.py
@@ -73,6 +73,7 @@ def test_model_runner_input():
         num_prefill_tokens=2,
         num_decode_tokens=3,
         slot_mapping=torch.zeros(1),
+        multi_modal_placeholder_index_maps=None,
     )
     model_input = ModelInputForGPUWithSamplingMetadata(
         input_tokens=torch.ones(10),
@@ -124,6 +125,7 @@ def test_embedding_model_runner_input():
         num_prefill_tokens=2,
         num_decode_tokens=3,
         slot_mapping=torch.zeros(1),
+        multi_modal_placeholder_index_maps=None,
     )
     model_input = ModelInputForGPUWithPoolingMetadata(
         input_tokens=torch.ones(10),
@@ -174,6 +176,7 @@ def test_multi_step_model_runner_input():
         num_prefill_tokens=2,
         num_decode_tokens=3,
         slot_mapping=torch.zeros(1),
+        multi_modal_placeholder_index_maps=None,
     )
     frozen_model_input = ModelInputForGPUWithSamplingMetadata(
         input_tokens=torch.ones(10),
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 9ea89eca01f5b..a504cb1f7e318 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -7,6 +7,8 @@
 
 import torch
 
+from vllm.multimodal import MultiModalPlaceholderMap
+
 if TYPE_CHECKING:
     from vllm.worker.model_runner_base import (ModelRunnerBase,
                                                ModelRunnerInputBase,
@@ -108,6 +110,15 @@ class AttentionMetadata:
     # in block 0, and 1st slot in block 1, respectively.
     slot_mapping: torch.Tensor
 
+    # The index maps that relate multi-modal embeddings to the corresponding
+    # placeholders.
+    #
+    # N.B. These aren't really related to attention and don't belong on this
+    # type -- this is just a temporary solution to make them available to
+    # `model_executable`.
+    multi_modal_placeholder_index_maps: Optional[Dict[
+        str, MultiModalPlaceholderMap.IndexMap]]
+
     @property
     @abstractmethod
     def prefill_metadata(self) -> Optional["AttentionMetadata"]:
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index c216d195c9e7e..409a42187f46c 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -215,6 +215,8 @@ def prefill_metadata(
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=0,
             slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
+            multi_modal_placeholder_index_maps=self.
+            multi_modal_placeholder_index_maps,
             seq_lens=self.seq_lens[:self.num_prefills],
             seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
             max_query_len=self.max_query_len,
@@ -243,6 +245,7 @@ def decode_metadata(self) -> Optional["BlocksparseFlashAttentionMetadata"]:
             num_prefill_tokens=0,
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
+            multi_modal_placeholder_index_maps=None,
             seq_lens=None,
             seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
             max_query_len=None,
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index c294fcf7f08fe..ab363ac78b028 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -1,4 +1,5 @@
 """Attention layer with FlashAttention."""
+from collections import defaultdict
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
 
@@ -14,6 +15,7 @@
                                            compute_slot_mapping_start_idx,
                                            is_block_tables_empty)
 from vllm.forward_context import get_forward_context
+from vllm.multimodal import MultiModalPlaceholderMap
 from vllm.utils import (async_tensor_h2d, direct_register_custom_op,
                         make_tensor_with_pad)
 
@@ -169,6 +171,8 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]:
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=0,
             slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
+            multi_modal_placeholder_index_maps=self.
+            multi_modal_placeholder_index_maps,
             seq_lens=self.seq_lens[:self.num_prefills],
             seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
             max_query_len=self.max_query_len,
@@ -198,6 +202,7 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]:
             num_prefill_tokens=0,
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
+            multi_modal_placeholder_index_maps=None,
             seq_lens=None,
             seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
             max_decode_query_len=self.max_decode_query_len,
@@ -297,6 +302,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         self.context_lens: List[int] = []
         self.block_tables: List[List[int]] = []
         self.curr_seq_lens: List[int] = []
+        self.multimodal_placeholder_maps: Dict[
+            str,
+            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
         self.num_prefills = 0
         self.num_prefill_tokens = 0
         self.num_decode_tokens = 0
@@ -327,6 +335,12 @@ def _add_seq_group(
             self.context_lens.append(context_len)
 
             if is_prompt:
+                mm_maps = inter_data.multi_modal_placeholder_maps
+                if mm_maps:
+                    for modality, placeholders in mm_maps.items():
+                        self.multimodal_placeholder_maps[modality].extend(
+                            placeholders)
+
                 self.num_prefills += 1
                 self.num_prefill_tokens += token_len
                 self.prefill_seq_lens.append(seq_len)
@@ -449,6 +463,11 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
                                     dtype=torch.int32,
                                     device=device)
+        placeholder_index_maps = {
+            modality: placeholder_map.index_map()
+            for modality, placeholder_map in
+            self.multimodal_placeholder_maps.items()
+        }
         torch.cumsum(seq_lens_tensor,
                      dim=0,
                      dtype=seq_start_loc.dtype,
@@ -464,6 +483,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
+            multi_modal_placeholder_index_maps=placeholder_index_maps,
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=max_query_len,
             max_decode_query_len=max_decode_query_len,
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 658805d35be0a..107e3bbf79666 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -1,7 +1,10 @@
+from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type
 
+from vllm.multimodal import MultiModalPlaceholderMap
+
 try:
     from flashinfer import BatchDecodeWithPagedKVCacheWrapper
     from flashinfer.decode import CUDAGraphBatchDecodeWithPagedKVCacheWrapper
@@ -215,6 +218,7 @@ def graph_capture_get_metadata_for_batch(
         attn_metadata = self.runner.attn_backend.make_metadata(
             num_prefills=0,
             slot_mapping=self._graph_slot_mapping[:batch_size],
+            multi_modal_placeholder_index_maps=None,
             num_prefill_tokens=0,
             num_decode_tokens=batch_size,
             max_prefill_seq_len=0,
@@ -470,6 +474,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         self.context_lens: List[int] = []
         self.block_tables: List[List[int]] = []
         self.curr_seq_lens: List[int] = []
+        self.multimodal_placeholder_maps: Dict[
+            str,
+            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
         self.num_prefills = 0
         self.num_prefill_tokens = 0
         self.num_decode_tokens = 0
@@ -519,6 +526,11 @@ def _add_seq_group(
                  inter_data.curr_sliding_window_blocks):
             self.context_lens.append(context_len)
             if is_prompt:
+                mm_maps = inter_data.multi_modal_placeholder_maps
+                if mm_maps:
+                    for modality, placeholders in mm_maps.items():
+                        self.multimodal_placeholder_maps[modality].extend(
+                            placeholders)
                 self.num_prefills += 1
                 self.num_prefill_tokens += token_len
                 self.prefill_seq_lens.append(seq_len)
@@ -651,6 +663,11 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
                                     dtype=torch.int32,
                                     device=device)
+        placeholder_index_maps = {
+            modality: placeholder_map.index_map()
+            for modality, placeholder_map in
+            self.multimodal_placeholder_maps.items()
+        }
         torch.cumsum(seq_lens_tensor,
                      dim=0,
                      dtype=seq_start_loc.dtype,
@@ -694,6 +711,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             decode_query_len=decode_query_len,
             num_prefills=self.num_prefills,
             slot_mapping=slot_mapping_tensor,
+            multi_modal_placeholder_index_maps=placeholder_index_maps,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             max_prefill_seq_len=max_prefill_seq_len,
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py
index 4116fbf00020c..888adbffb8578 100644
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@@ -1,5 +1,6 @@
+from collections import defaultdict
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
 
 import torch
 
@@ -7,6 +8,7 @@
                                               AttentionMetadata,
                                               AttentionMetadataBuilder)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.multimodal import MultiModalPlaceholderMap
 
 if TYPE_CHECKING:
     from vllm.worker.model_runner import ModelInputForGPUBuilder
@@ -135,6 +137,8 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=0,
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=self.
+            multi_modal_placeholder_index_maps,
             seq_lens=self.seq_lens[:self.num_prefills],
             seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
             max_decode_query_len=0,
@@ -167,6 +171,7 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
             num_prefill_tokens=0,
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=None,
             seq_lens=None,
             seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
             max_decode_query_len=self.max_decode_query_len,
@@ -189,6 +194,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         self.prefill_seq_lens: List[int] = []
         self.context_lens: List[int] = []
         self.curr_seq_lens: List[int] = []
+        self.multimodal_placeholder_maps: Dict[
+            str,
+            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
         self.num_prefills = 0
         self.num_prefill_tokens = 0
         self.num_decode_tokens = 0
@@ -213,6 +221,12 @@ def _add_seq_group(
             self.context_lens.append(context_len)
 
             if is_prompt:
+                mm_maps = inter_data.multi_modal_placeholder_maps
+                if mm_maps:
+                    for modality, placeholders in mm_maps.items():
+                        self.multimodal_placeholder_maps[modality].extend(
+                            placeholders)
+
                 self.num_prefills += 1
                 self.num_prefill_tokens += token_len
                 self.prefill_seq_lens.append(seq_len)
@@ -280,6 +294,11 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
                                     dtype=torch.int32,
                                     device=device)
+        placeholder_index_maps = {
+            modality: placeholder_map.index_map()
+            for modality, placeholder_map in
+            self.multimodal_placeholder_maps.items()
+        }
         torch.cumsum(seq_lens_tensor,
                      dim=0,
                      dtype=seq_start_loc.dtype,
@@ -296,6 +315,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         return PlaceholderAttentionMetadata(
             num_prefills=self.num_prefills,
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=placeholder_index_maps,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 30859dfa60634..b129d0d992f2f 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -150,6 +150,8 @@ def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=0,
             slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
+            multi_modal_placeholder_index_maps=self.
+            multi_modal_placeholder_index_maps,
             seq_lens=self.seq_lens[:self.num_prefills],
             seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
             max_query_len=self.max_query_len,
@@ -178,6 +180,7 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
             num_prefill_tokens=0,
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
+            multi_modal_placeholder_index_maps=None,
             seq_lens=None,
             seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
             max_query_len=None,
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 32fccd0dfb496..55293bbb06e1d 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -1,4 +1,5 @@
 """Attention backend utils"""
+from collections import defaultdict
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, Dict, List, Type, TypeVar, Union
 
@@ -7,6 +8,7 @@
 
 from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder,
                             AttentionState)
+from vllm.multimodal import MultiModalPlaceholderMap
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 
 if TYPE_CHECKING:
@@ -123,6 +125,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         self.context_lens: List[int] = []
         self.block_tables: List[List[int]] = []
         self.curr_seq_lens: List[int] = []
+        self.multimodal_placeholder_maps: Dict[
+            str,
+            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
         self.num_prefills = 0
         self.num_prefill_tokens = 0
         self.num_decode_tokens = 0
@@ -147,6 +152,12 @@ def _add_seq_group(
                  inter_data.curr_sliding_window_blocks):
             self.context_lens.append(context_len)
             if is_prompt:
+                mm_maps = inter_data.multi_modal_placeholder_maps
+                if mm_maps:
+                    for modality, placeholders in mm_maps.items():
+                        self.multimodal_placeholder_maps[modality].extend(
+                            placeholders)
+
                 self.num_prefills += 1
                 self.num_prefill_tokens += token_len
                 self.prefill_seq_lens.append(seq_len)
@@ -242,6 +253,11 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
                                     dtype=torch.int32,
                                     device=device)
+        placeholder_index_maps = {
+            modality: placeholder_map.index_map()
+            for modality, placeholder_map in
+            self.multimodal_placeholder_maps.items()
+        }
         torch.cumsum(seq_lens_tensor,
                      dim=0,
                      dtype=seq_start_loc.dtype,
@@ -254,6 +270,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         return self._metadata_cls(  # type: ignore
             num_prefills=self.num_prefills,
             slot_mapping=slot_mapping_tensor,
+            multi_modal_placeholder_index_maps=placeholder_index_maps,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
@@ -305,6 +322,7 @@ def graph_capture_get_metadata_for_batch(
             num_prefill_tokens=0,
             num_decode_tokens=batch_size,
             slot_mapping=self._graph_slot_mapping[:batch_size],
+            multi_modal_placeholder_index_maps=None,
             seq_lens=None,
             seq_lens_tensor=self._graph_seq_lens[:batch_size],
             max_query_len=1,
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 5aaf13d8ea744..21877f2dded0e 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -212,6 +212,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]:
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=0,
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=self.
+            multi_modal_placeholder_index_maps,
             seq_lens=seq_lens,
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=self.max_query_len,
@@ -255,6 +257,7 @@ def decode_metadata(self) -> Optional["XFormersMetadata"]:
             num_prefill_tokens=0,
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=None,
             seq_lens_tensor=seq_lens_tensor,
             max_prefill_seq_len=0,
             max_decode_seq_len=self.max_decode_seq_len,
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index e35c05f4fe7f7..e56d5cddce424 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1308,6 +1308,8 @@ def schedule(
                     # `multi_modal_data` will be None.
                     multi_modal_data=seq_group.multi_modal_data
                     if scheduler_outputs.num_prefill_groups > 0 else None,
+                    multi_modal_placeholders=seq_group.multi_modal_placeholders
+                    if scheduler_outputs.num_prefill_groups > 0 else None,
                     mm_processor_kwargs=seq_group.mm_processor_kwargs,
                     prompt_adapter_request=seq_group.prompt_adapter_request,
                 )
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index 7b73922ddd2c5..ac7b3ca28b406 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -3,7 +3,7 @@
                    SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt,
                    build_explicit_enc_dec_prompt, to_enc_dec_tuple_list,
                    token_inputs, zip_enc_dec_prompts)
-from .registry import InputContext, InputRegistry
+from .registry import DummyData, InputContext, InputRegistry
 
 INPUT_REGISTRY = InputRegistry()
 """
@@ -29,6 +29,7 @@
     "to_enc_dec_tuple_list",
     "zip_enc_dec_prompts",
     "INPUT_REGISTRY",
+    "DummyData",
     "InputContext",
     "InputRegistry",
 ]
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 9a094191eda38..ba393cbcce4eb 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -4,7 +4,7 @@
 from typing_extensions import NotRequired, TypedDict, TypeVar
 
 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalDataDict
+    from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
 
 
 class TextPrompt(TypedDict):
@@ -136,6 +136,12 @@ class TokenInputs(TypedDict):
     if the model supports it.
     """
 
+    multi_modal_placeholders: NotRequired[
+        Optional["MultiModalPlaceholderDict"]]
+    """
+    Placeholder ranges for the multi-modal data.
+    """
+
     mm_processor_kwargs: NotRequired[Optional[Dict[str, Any]]]
     """
     Optional multi-modal processor kwargs to be forwarded to the
@@ -149,6 +155,7 @@ def token_inputs(
     prompt_token_ids: List[int],
     prompt: Optional[str] = None,
     multi_modal_data: Optional["MultiModalDataDict"] = None,
+    multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None,
     mm_processor_kwargs: Optional[Dict[str, Any]] = None,
 ) -> TokenInputs:
     """Construct :class:`TokenInputs` from optional values."""
@@ -158,6 +165,8 @@ def token_inputs(
         inputs["prompt"] = prompt
     if multi_modal_data is not None:
         inputs["multi_modal_data"] = multi_modal_data
+    if multi_modal_placeholders is not None:
+        inputs["multi_modal_placeholders"] = multi_modal_placeholders
     if mm_processor_kwargs is not None:
         inputs["mm_processor_kwargs"] = mm_processor_kwargs
 
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 4cebc91ce715c..fbf912a212568 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,8 +1,8 @@
 import functools
 from collections import UserDict
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional,
-                    Protocol, Tuple, Type)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, NamedTuple,
+                    Optional, Protocol, Type)
 
 from torch import nn
 from transformers import PretrainedConfig
@@ -16,7 +16,8 @@
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
-    from vllm.multimodal import MultiModalDataDict, MultiModalRegistry
+    from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
+                                 MultiModalRegistry)
     from vllm.sequence import SequenceData
 
 logger = init_logger(__name__)
@@ -63,6 +64,14 @@ def get_hf_image_processor_config(self) -> Dict[str, Any]:
 N = TypeVar("N", bound=Type[nn.Module])
 
 
+class DummyData(NamedTuple):
+    """Dummy data used for profiling."""
+
+    seq_data: "SequenceData"
+    multi_modal_data: Optional["MultiModalDataDict"] = None
+    multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None
+
+
 class DummyDataFactory(Protocol):
 
     def __call__(
@@ -71,7 +80,7 @@ def __call__(
         seq_len: int,
         mm_counts: Mapping[str, int],
         **mm_processor_kwargs: Any,
-    ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
+    ) -> DummyData:
         """
         Create dummy data to be inputted into the model.
 
@@ -123,7 +132,7 @@ def _default_dummy_data_factory(
         ctx: InputContext,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
+    ) -> DummyData:
         """
         The default dummy data factory represents the longest possible text
         that can be inputted to the model.
@@ -134,10 +143,7 @@ def _default_dummy_data_factory(
         # Avoid circular import
         from vllm.sequence import SequenceData
 
-        dummy_seq_data = SequenceData.from_prompt_token_counts((0, seq_len))
-        dummy_multi_modal_data = None
-
-        return dummy_seq_data, dummy_multi_modal_data
+        return DummyData(SequenceData.from_prompt_token_counts((0, seq_len)))
 
     def register_dummy_data(self, factory: DummyDataFactory):
         """
@@ -195,7 +201,7 @@ def dummy_data_for_profiling(
         seq_len: int,
         mm_registry: "MultiModalRegistry",
         is_encoder_data: bool = False,
-    ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
+    ) -> DummyData:
         """
         Create dummy data for profiling the memory usage of a model.
 
@@ -220,12 +226,12 @@ def dummy_data_for_profiling(
         mm_processor_kwargs = get_allowed_kwarg_only_overrides(
             dummy_factory, overrides=model_config.mm_processor_kwargs)
 
-        seq_data, mm_data = dummy_factory(InputContext(model_config), seq_len,
-                                          _MultiModalCounts(mm_counts),
-                                          **mm_processor_kwargs)
+        dummy_data = dummy_factory(InputContext(model_config), seq_len,
+                                   _MultiModalCounts(mm_counts),
+                                   **mm_processor_kwargs)
 
         # Having more tokens is over-conservative but otherwise fine
-        num_tokens = seq_data.prompt_token_ids
+        num_tokens = dummy_data.seq_data.prompt_token_ids
         if len(num_tokens) < seq_len:
             if is_encoder_data:
                 print_warning_once(
@@ -235,15 +241,15 @@ def dummy_data_for_profiling(
                 raise AssertionError(
                     f"Expected at least {seq_len} dummy tokens for profiling, "
                     f"but found {len(num_tokens)} tokens instead.")
-        if mm_data is not None:
-            for k, v in mm_data.items():
+        if dummy_data.multi_modal_data is not None:
+            for k, v in dummy_data.multi_modal_data.items():
                 num_items = len(v) if isinstance(v, list) else 1
                 num_expected = mm_counts[k]
                 assert num_items >= num_expected, (
                     f"Expected at least {num_expected} dummy '{k}' instances "
                     f"for profiling, but found {num_items} instances instead.")
 
-        return seq_data, mm_data
+        return dummy_data
 
     def _default_input_processor(
         self,
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 1f2d7384076ed..e612010677364 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -98,6 +98,11 @@ def input_processor_for_blip(
     if multi_modal_data is None or "image" not in multi_modal_data:
         return inputs
 
+    if "multi_modal_placeholders" in inputs and "image" in inputs[
+            "multi_modal_placeholders"]:
+        # The inputs already have placeholders.
+        return inputs
+
     tokenizer = cached_get_tokenizer(model_config.tokenizer)
 
     if image_feature_size_override is None:
@@ -105,7 +110,7 @@ def input_processor_for_blip(
     else:
         image_feature_size = image_feature_size_override
 
-    new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
         tokenizer,
         inputs.get("prompt"),
         inputs["prompt_token_ids"],
@@ -116,7 +121,8 @@ def input_processor_for_blip(
     # NOTE: Create a defensive copy of the original inputs
     return token_inputs(prompt_token_ids=new_token_ids,
                         prompt=new_prompt,
-                        multi_modal_data=multi_modal_data)
+                        multi_modal_data=multi_modal_data,
+                        multi_modal_placeholders={"image": ranges})
 
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index c3b3cc8a4ddb6..db1f92649bd49 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -9,13 +9,14 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.utils import consecutive_placeholder_ranges
 from vllm.sequence import IntermediateTensors, SequenceData
 
 from .blip import (BlipVisionModel, dummy_image_for_blip,
@@ -425,7 +426,11 @@ def dummy_seq_data_for_blip2(
     return SequenceData.from_prompt_token_counts(
         (image_token_id, image_feature_size * num_images),
         (0, seq_len - image_feature_size * num_images),
-    )
+    ), {
+        "image":
+        consecutive_placeholder_ranges(num_items=num_images,
+                                       item_size=image_feature_size)
+    }
 
 
 def dummy_data_for_blip2(ctx: InputContext, seq_len: int,
@@ -434,7 +439,7 @@ def dummy_data_for_blip2(ctx: InputContext, seq_len: int,
     vision_config = hf_config.vision_config
     num_images = mm_counts["image"]
 
-    seq_data = dummy_seq_data_for_blip2(
+    seq_data, ranges = dummy_seq_data_for_blip2(
         hf_config,
         seq_len,
         num_images,
@@ -444,7 +449,7 @@ def dummy_data_for_blip2(ctx: InputContext, seq_len: int,
     if isinstance(vision_config, Blip2VisionConfig):
         mm_data = dummy_image_for_blip(vision_config, num_images)
 
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index aaf559ca386cc..9f6c6786c0fa4 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -11,8 +11,8 @@
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -30,6 +30,7 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.utils import (cached_get_tokenizer,
+                                   consecutive_placeholder_ranges,
                                    repeat_and_pad_placeholder_tokens)
 from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.utils import print_warning_once
@@ -73,7 +74,11 @@ def dummy_seq_data_for_chameleon(
     return SequenceData.from_prompt_token_counts(
         (image_token_id, image_feature_size * num_images),
         (0, seq_len - image_feature_size * num_images),
-    )
+    ), {
+        "image":
+        consecutive_placeholder_ranges(num_items=num_images,
+                                       item_size=image_feature_size)
+    }
 
 
 def dummy_image_for_chameleon(
@@ -97,14 +102,14 @@ def dummy_data_for_chameleon(ctx: InputContext, seq_len: int,
                              mm_counts: Mapping[str, int]):
     num_images = mm_counts["image"]
 
-    seq_data = dummy_seq_data_for_chameleon(
+    seq_data, ranges = dummy_seq_data_for_chameleon(
         seq_len,
         num_images,
         image_token_id=CHAMELEON_IMAGE_TOKEN_ID,
     )
 
     mm_data = dummy_image_for_chameleon(num_images)
-    return seq_data, mm_data
+    return DummyData(seq_data, mm_data, ranges)
 
 
 def input_processor_for_chameleon(ctx: InputContext,
@@ -120,9 +125,14 @@ def input_processor_for_chameleon(ctx: InputContext,
     if multi_modal_data is None or "image" not in multi_modal_data:
         return inputs
 
+    if "multi_modal_placeholders" in inputs and "image" in inputs[
+            "multi_modal_placeholders"]:
+        # The inputs already have placeholders.
+        return inputs
+
     model_config = ctx.model_config
     tokenizer = cached_get_tokenizer(model_config.tokenizer)
-    new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
         tokenizer,
         inputs.get("prompt"),
         inputs["prompt_token_ids"],
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index a3293020c042e..2d81b9266826b 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -19,6 +19,7 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.utils import (cached_get_tokenizer,
+                                   consecutive_placeholder_ranges,
                                    repeat_and_pad_placeholder_tokens)
 from vllm.sequence import SequenceData
 
@@ -49,14 +50,13 @@ def get_max_clip_image_tokens(hf_config: CLIPVisionConfig) -> int:
     return get_clip_image_feature_size(hf_config)
 
 
-def dummy_seq_data_for_clip(
-    hf_config: CLIPVisionConfig,
-    seq_len: int,
-    num_images: int,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
-):
+def dummy_seq_data_for_clip(hf_config: CLIPVisionConfig,
+                            seq_len: int,
+                            num_images: int,
+                            *,
+                            image_token_id: int,
+                            image_feature_size_override: Optional[int] = None,
+                            mm_key: str = "image"):
     if image_feature_size_override is None:
         image_feature_size = get_clip_image_feature_size(hf_config)
     else:
@@ -65,7 +65,11 @@ def dummy_seq_data_for_clip(
     return SequenceData.from_prompt_token_counts(
         (image_token_id, image_feature_size * num_images),
         (0, seq_len - image_feature_size * num_images),
-    )
+    ), {
+        mm_key:
+        consecutive_placeholder_ranges(num_items=num_images,
+                                       item_size=image_feature_size)
+    }
 
 
 def dummy_image_for_clip(
@@ -117,6 +121,11 @@ def input_processor_for_clip(
     if multi_modal_data is None or "image" not in multi_modal_data:
         return inputs
 
+    if "multi_modal_placeholders" in inputs and "image" in inputs[
+            "multi_modal_placeholders"]:
+        # The inputs already have placeholders.
+        return inputs
+
     tokenizer = cached_get_tokenizer(model_config.tokenizer)
 
     if image_feature_size_override is None:
@@ -130,7 +139,7 @@ def input_processor_for_clip(
     else:
         image_feature_size = image_feature_size_override
 
-    new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
         tokenizer,
         inputs.get("prompt"),
         inputs["prompt_token_ids"],
@@ -141,7 +150,8 @@ def input_processor_for_clip(
     # NOTE: Create a defensive copy of the original inputs
     return token_inputs(prompt_token_ids=new_token_ids,
                         prompt=new_prompt,
-                        multi_modal_data=multi_modal_data)
+                        multi_modal_data=multi_modal_data,
+                        multi_modal_placeholders={"image": ranges})
 
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 358d1dd288c49..0de590d1d8372 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -27,8 +27,8 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -37,9 +37,11 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.image import cached_get_image_processor
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.multimodal.utils import (cached_get_tokenizer,
+                                   consecutive_placeholder_ranges)
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
                            SequenceData)
+from vllm.utils import is_list_of
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings
@@ -103,7 +105,11 @@ def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int):
     token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images
     token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
                        [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids)
+    return SequenceData(token_ids), {
+        "image":
+        consecutive_placeholder_ranges(num_items=num_images,
+                                       item_size=image_feature_size)
+    }
 
 
 def dummy_image_for_fuyu(
@@ -119,15 +125,15 @@ def dummy_image_for_fuyu(
 def dummy_data_for_fuyu(ctx: InputContext, seq_len: int,
                         mm_counts: Mapping[str, int]):
     num_images = mm_counts["image"]
-    seq_data = dummy_seq_data_for_fuyu(ctx, seq_len, num_images)
+    seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images)
     mm_data = dummy_image_for_fuyu(num_images,
                                    image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
                                    image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT)
-    return seq_data, mm_data
+    return DummyData(seq_data, mm_data, ranges)
 
 
 def _fuyu_image_preprocess(image_processor: FuyuImageProcessor,
-                           data: Image.Image):
+                           data: List[Image.Image]):
     image_encoding = image_processor.preprocess(data, return_tensors="pt")
     batch_images = torch.stack([img[0] for img in image_encoding["images"]
                                 ]).unsqueeze(1)
@@ -158,8 +164,10 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs):
     model_config = ctx.model_config
     image_data = multi_modal_data["image"]
     new_multi_modal_data = {}
+    image_list = image_data if isinstance(image_data, list) else [image_data]
+
     # process image data
-    if isinstance(image_data, Image.Image):
+    if is_list_of(image_list, Image.Image):
         # Fuyu's image_processor can also finish token padding
         image_processor: FuyuImageProcessor = cached_get_image_processor(
             model_config.model)
@@ -171,7 +179,7 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs):
         ])
         new_multi_modal_data["image"] = image_patches
 
-    elif isinstance(image_data, torch.Tensor):
+    elif is_list_of(image_list, torch.Tensor):
         raise NotImplementedError("Embeddings input is not supported yet")
     else:
         raise TypeError(f"Invalid image type: {type(image_data)}")
@@ -198,12 +206,13 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs):
 
 def input_mapper_for_fuyu(ctx: InputContext, data: object):
     model_config = ctx.model_config
-    if isinstance(data, Image.Image):
+    data_list = data if isinstance(data, list) else [data]
+    if is_list_of(data_list, Image.Image):
         # Fuyu's image_processor can also finish token padding
         image_processor: FuyuImageProcessor = cached_get_image_processor(
             model_config.model)
 
-        model_image_input = _fuyu_image_preprocess(image_processor, data)
+        model_image_input = _fuyu_image_preprocess(image_processor, data_list)
         data = torch.stack([
             image_patch[0]
             for image_patch in model_image_input["image_patches"]
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 1c1fde5b30983..d2ec0ff6e74c6 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -17,8 +17,8 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor.layers.quantization import (AWQConfig,
                                                      QuantizationConfig)
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -379,7 +379,7 @@ def dummy_data(
             model_config.tokenizer,
             trust_remote_code=model_config.trust_remote_code)
 
-        seq_data = dummy_seq_data_for_clip(
+        seq_data, ranges = dummy_seq_data_for_clip(
             hf_config.vision_config,
             seq_len,
             num_images,
@@ -398,7 +398,7 @@ def dummy_data(
             image_height_override=max_image_height,
         )
 
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
 
 
 input_pipeline = InternVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT)
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 27055e7ced865..7fbd59ebd98fd 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -10,7 +10,8 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext)
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -111,7 +112,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int,
     image_feature_size = get_max_llava_image_tokens(ctx)
 
     if isinstance(vision_config, CLIPVisionConfig):
-        seq_data = dummy_seq_data_for_clip(
+        seq_data, ranges = dummy_seq_data_for_clip(
             vision_config,
             seq_len,
             num_images,
@@ -120,9 +121,9 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int,
         )
 
         mm_data = dummy_image_for_clip(vision_config, num_images)
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
     elif isinstance(vision_config, SiglipVisionConfig):
-        seq_data = dummy_seq_data_for_siglip(
+        seq_data, ranges = dummy_seq_data_for_siglip(
             vision_config,
             seq_len,
             num_images,
@@ -131,9 +132,9 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int,
         )
 
         mm_data = dummy_image_for_siglip(vision_config, num_images)
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
     elif isinstance(vision_config, PixtralVisionConfig):
-        seq_data = dummy_seq_data_for_pixtral_hf(
+        seq_data, ranges = dummy_seq_data_for_pixtral_hf(
             vision_config,
             seq_len,
             num_images,
@@ -142,7 +143,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int,
         )
 
         mm_data = dummy_image_for_pixtral_hf(vision_config, num_images)
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index e8540d85ff565..e8c5786066170 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -12,7 +12,8 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig, PoolerConfig
-from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext)
 from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -180,7 +181,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int,
     max_feat_height, max_feat_width = pinpoint
 
     if isinstance(vision_config, CLIPVisionConfig):
-        seq_data = dummy_seq_data_for_clip(
+        seq_data, ranges = dummy_seq_data_for_clip(
             vision_config,
             seq_len,
             num_images,
@@ -195,9 +196,9 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int,
             image_height_override=max_feat_height,
         )
 
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
     elif isinstance(vision_config, SiglipVisionConfig):
-        seq_data = dummy_seq_data_for_siglip(
+        seq_data, ranges = dummy_seq_data_for_siglip(
             vision_config,
             seq_len,
             num_images,
@@ -212,7 +213,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int,
             image_height_override=max_feat_height,
         )
 
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index b8051d5fc6ae2..b755e2347f6ed 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -11,8 +11,8 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -108,33 +108,35 @@ def dummy_data_for_llava_next_video(ctx: InputContext, seq_len: int,
     video_feature_size = frames_per_video * tokens_per_frame
 
     if isinstance(vision_config, CLIPVisionConfig):
-        seq_data = dummy_seq_data_for_clip(
+        seq_data, ranges = dummy_seq_data_for_clip(
             vision_config,
             seq_len,
             num_videos,
             image_token_id=hf_config.video_token_index,
             image_feature_size_override=video_feature_size,
+            mm_key="video",
         )
 
         pil_frame = dummy_image_for_clip(vision_config, num_images=1)
         np_frame = np.array(pil_frame["image"])
         mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0)
         mm_data = {"video": mm_data_per_video}
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
     elif isinstance(vision_config, SiglipVisionConfig):
-        seq_data = dummy_seq_data_for_siglip(
+        seq_data, ranges = dummy_seq_data_for_siglip(
             vision_config,
             seq_len,
             num_videos,
             image_token_id=hf_config.video_token_index,
             image_feature_size_override=video_feature_size,
+            mm_key="video",
         )
 
         pil_frame = dummy_image_for_siglip(vision_config, num_images=1)
         np_frame = np.array(pil_frame["image"])
         mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0)
         mm_data = {"video": mm_data_per_video}
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)
@@ -145,6 +147,12 @@ def input_processor_for_llava_next_video(ctx: InputContext,
     multi_modal_data = inputs.get("multi_modal_data")
     if multi_modal_data is None or "video" not in multi_modal_data:
         return inputs
+
+    if "multi_modal_placeholders" in inputs and "video" in inputs[
+            "multi_modal_placeholders"]:
+        # The inputs already have placeholders.
+        return inputs
+
     video_data = multi_modal_data["video"]
 
     model_config = ctx.model_config
@@ -160,7 +168,7 @@ def input_processor_for_llava_next_video(ctx: InputContext,
 
         tokenizer = cached_get_tokenizer(model_config.tokenizer)
 
-        new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+        new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
             tokenizer,
             inputs.get("prompt"),
             inputs["prompt_token_ids"],
@@ -170,7 +178,8 @@ def input_processor_for_llava_next_video(ctx: InputContext,
 
         return token_inputs(prompt_token_ids=new_token_ids,
                             prompt=new_prompt,
-                            multi_modal_data=multi_modal_data)
+                            multi_modal_data=multi_modal_data,
+                            multi_modal_placeholders={"video": ranges})
 
     elif is_list_of(video_data, np.ndarray):
         raise NotImplementedError(
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index a0cf208a65f36..f410d64577a77 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -15,8 +15,8 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -218,31 +218,31 @@ def dummy_data_for_llava_onevision(ctx: InputContext, seq_len: int,
     video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames)
 
     if isinstance(vision_config, CLIPVisionConfig):
-        seq_data = dummy_seq_data_for_clip(
+        seq_data, ranges = dummy_seq_data_for_clip(
             vision_config,
             seq_len,
             num_videos,
             image_token_id=hf_config.video_token_index,
             image_feature_size_override=video_feature_size,
-        )
+            mm_key="video")
 
         mm_data = dummy_video_for_clip(vision_config,
                                        num_frames=num_frames,
                                        num_videos=num_videos)
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
     elif isinstance(vision_config, SiglipVisionConfig):
-        seq_data = dummy_seq_data_for_siglip(
+        seq_data, ranges = dummy_seq_data_for_siglip(
             vision_config,
             seq_len,
             num_videos,
             image_token_id=hf_config.video_token_index,
             image_feature_size_override=video_feature_size,
-        )
+            mm_key="video")
 
         mm_data = dummy_video_for_siglip(vision_config,
                                          num_frames=num_frames,
                                          num_videos=num_videos)
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data, ranges)
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)
@@ -320,7 +320,7 @@ def input_processor_when_multimodal_input_video(ctx: InputContext,
         video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames)
         tokenizer = cached_get_tokenizer(model_config.tokenizer)
 
-        new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+        new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
             tokenizer,
             inputs.get("prompt"),
             inputs["prompt_token_ids"],
@@ -330,7 +330,8 @@ def input_processor_when_multimodal_input_video(ctx: InputContext,
 
         return token_inputs(prompt_token_ids=new_token_ids,
                             prompt=new_prompt,
-                            multi_modal_data=multi_modal_data)
+                            multi_modal_data=multi_modal_data,
+                            multi_modal_placeholders={"video": ranges})
 
     elif is_list_of(video_data, np.ndarray):
         video_feature_size = []
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 4917c33136069..a526a5dccd398 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -36,8 +36,8 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2,
@@ -277,7 +277,7 @@ def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int,
     seq_data = dummy_seq_data_for_minicpmv(seq_len, num_images)
     mm_data = dummy_image_for_minicpmv(ctx, hf_config, num_images)
 
-    return seq_data, mm_data
+    return DummyData(seq_data, mm_data)
 
 
 def input_processor_for_minicpmv(ctx: InputContext, inputs: DecoderOnlyInputs):
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 5cf5272cae878..19c3827e43703 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -36,7 +36,7 @@
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import CacheConfig, MultiModalConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs,
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
                          EncoderDecoderInputs, InputContext)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -176,13 +176,14 @@ def dummy_image(num_images: int, ):
 def dummy_decoder_data_for_mllama(ctx: InputContext, seq_len: int,
                                   mm_counts: Mapping[str, int]):
     num_images = mm_counts["image"]
-    return dummy_decoder_seq_data(seq_len, num_images), None
+    return DummyData(dummy_decoder_seq_data(seq_len, num_images))
 
 
 def dummy_encoder_data_for_mllama(ctx: InputContext, seq_len: int,
                                   mm_counts: Mapping[str, int]):
     num_images = mm_counts["image"]
-    return dummy_encoder_seq_data(ctx, num_images), dummy_image(num_images)
+    return DummyData(dummy_encoder_seq_data(ctx, num_images),
+                     dummy_image(num_images))
 
 
 def _prepare_aspect_ratio_attention_mask(
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 8e29c6079b994..4b6061e113cb2 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -7,8 +7,8 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -58,7 +58,7 @@ def dummy_data_for_paligemma(ctx: InputContext, seq_len: int,
     vision_config = hf_config.vision_config
     num_images = mm_counts["image"]
 
-    seq_data = dummy_seq_data_for_siglip(
+    seq_data, ranges = dummy_seq_data_for_siglip(
         vision_config,
         seq_len,
         num_images,
@@ -66,7 +66,7 @@ def dummy_data_for_paligemma(ctx: InputContext, seq_len: int,
     )
 
     mm_data = dummy_image_for_siglip(vision_config, num_images)
-    return seq_data, mm_data
+    return DummyData(seq_data, mm_data, ranges)
 
 
 def input_processor_for_paligemma(ctx: InputContext,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 4928e447d5b9e..5b477a8ed5f49 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -28,8 +28,8 @@
 from vllm.attention import AttentionMetadata
 from vllm.config import (CacheConfig, ModelConfig, MultiModalConfig,
                          PoolerConfig)
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -380,7 +380,7 @@ def dummy_data_for_phi3v(ctx: InputContext,
 
     image_feature_size = get_max_phi3v_image_tokens(ctx, num_crops=num_crops)
 
-    seq_data = dummy_seq_data_for_clip(
+    seq_data, ranges = dummy_seq_data_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         seq_len,
         num_images,
@@ -394,7 +394,7 @@ def dummy_data_for_phi3v(ctx: InputContext,
         image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
     )
 
-    return seq_data, mm_data
+    return DummyData(seq_data, mm_data, ranges)
 
 
 @lru_cache
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 6b53bf5660096..051454c49bff8 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -17,8 +17,8 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -28,7 +28,8 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.base import MultiModalInputs
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.multimodal.utils import (cached_get_tokenizer,
+                                   consecutive_placeholder_ranges)
 from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.utils import is_list_of
@@ -81,7 +82,12 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
     )
 
     mm_data = {"image": num_images * [image]}
-    return seq_data, mm_data
+    mm_placeholders = {
+        "image":
+        consecutive_placeholder_ranges(num_items=num_images,
+                                       item_size=image_feature_size)
+    }
+    return DummyData(seq_data, mm_data, mm_placeholders)
 
 
 def input_mapper_for_pixtral(ctx: InputContext,
@@ -630,13 +636,13 @@ def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int:
 
 
 def dummy_seq_data_for_pixtral_hf(
-    hf_config: PixtralVisionConfig,
-    seq_len: int,
-    num_images: int,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
-):
+        hf_config: PixtralVisionConfig,
+        seq_len: int,
+        num_images: int,
+        *,
+        image_token_id: int,
+        image_feature_size_override: Optional[int] = None,
+        mm_key: str = "image"):
     if image_feature_size_override is None:
         image_feature_size = get_max_pixtral_hf_image_feature_size(hf_config)
     else:
@@ -645,7 +651,11 @@ def dummy_seq_data_for_pixtral_hf(
     return SequenceData.from_prompt_token_counts(
         (image_token_id, image_feature_size * num_images),
         (0, seq_len - image_feature_size * num_images),
-    )
+    ), {
+        mm_key:
+        consecutive_placeholder_ranges(num_items=num_images,
+                                       item_size=image_feature_size)
+    }
 
 
 def dummy_image_for_pixtral_hf(
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 61665768eacf5..b2b5c70182135 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -23,8 +23,8 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -810,7 +810,7 @@ def dummy_data_for_qwen(
     ctx: InputContext,
     seq_len: int,
     mm_counts: Mapping[str, int],
-) -> Tuple[SequenceData, Optional[Dict]]:
+) -> DummyData:
     """Build dummy data for warming up Qwen models; this will only contain text
     matching the defaults for VLLM unless the model has a visual config.
 
@@ -829,7 +829,7 @@ def dummy_data_for_qwen(
     if not hasattr(hf_config, "visual"):
         seq_data = SequenceData.from_prompt_token_counts((0, seq_len))
         mm_data = None
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data)
 
     # We have a visual component - use images to warm up
     num_images = mm_counts["image"]
@@ -861,7 +861,7 @@ def dummy_data_for_qwen(
     # the data will get resized and the # of tokens per image is constant
     image = Image.new("RGB", (224, 224), color=0)
     mm_data = {"image": image if num_images == 1 else [image] * num_images}
-    return seq_data, mm_data
+    return DummyData(seq_data, mm_data)
 
 
 class QWenBaseModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 3d049eeb920b7..6114548bda42c 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -31,8 +31,8 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -44,6 +44,7 @@
 from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.multimodal.utils import consecutive_placeholder_ranges
 from vllm.sequence import IntermediateTensors, SequenceData
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -85,7 +86,8 @@ def forward(self, audio_features):
 def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int,
                                mm_counts: Mapping[str, int]):
     num_audios = mm_counts["audio"]
-    max_llm_audio_tokens = get_max_qwen2_audio_audio_tokens(ctx) * num_audios
+    max_tokens_per_audio = get_max_qwen2_audio_audio_tokens(ctx)
+    max_llm_audio_tokens = max_tokens_per_audio * num_audios
     if seq_len - max_llm_audio_tokens - 2 < 0:
         raise RuntimeError(
             f"Qwen2-Audio cannot process {num_audios} audios in a prompt, "
@@ -99,7 +101,12 @@ def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int,
         (0, seq_len - max_llm_audio_tokens),
     )
     dummy_audio = np.full((max_llm_audio_tokens * 2 * 2 * 160, ), 0.)
-    return dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios}
+    return DummyData(
+        dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios}, {
+            "audio":
+            consecutive_placeholder_ranges(num_items=num_audios,
+                                           item_size=max_tokens_per_audio)
+        })
 
 
 def get_processor(
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 1e12c2332b65e..d801903f8f9fe 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -44,8 +44,8 @@
 from vllm.config import CacheConfig, MultiModalConfig
 from vllm.distributed import get_pp_group, parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.activation import QuickGELU
@@ -744,9 +744,10 @@ def dummy_data_for_qwen2_vl(
     dummy_image = Image.new("RGB", (max_resized_width, max_resized_height),
                             color=0)
 
-    return dummy_seqdata, {
-        "image": dummy_image if num_images == 1 else [dummy_image] * num_images
-    }
+    return DummyData(dummy_seqdata, {
+        "image":
+        dummy_image if num_images == 1 else [dummy_image] * num_images
+    })
 
 
 def _get_llm_num_vision_tokens(
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 2e7ae32055aaf..acaf4afdecfe5 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -23,6 +23,7 @@
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.utils import (cached_get_tokenizer,
+                                   consecutive_placeholder_ranges,
                                    repeat_and_pad_placeholder_tokens)
 from vllm.sequence import SequenceData
 
@@ -61,6 +62,7 @@ def dummy_seq_data_for_siglip(
     *,
     image_token_id: int,
     image_feature_size_override: Optional[int] = None,
+    mm_key: str = "image",
 ):
     if image_feature_size_override is None:
         image_feature_size = get_siglip_image_feature_size(hf_config)
@@ -70,7 +72,11 @@ def dummy_seq_data_for_siglip(
     return SequenceData.from_prompt_token_counts(
         (image_token_id, image_feature_size * num_images),
         (0, seq_len - image_feature_size * num_images),
-    )
+    ), {
+        mm_key:
+        consecutive_placeholder_ranges(num_items=num_images,
+                                       item_size=image_feature_size)
+    }
 
 
 def dummy_image_for_siglip(
@@ -122,6 +128,11 @@ def input_processor_for_siglip(
     if multi_modal_data is None or "image" not in multi_modal_data:
         return inputs
 
+    if "multi_modal_placeholders" in inputs and "image" in inputs[
+            "multi_modal_placeholders"]:
+        # The inputs already have placeholders.
+        return inputs
+
     tokenizer = cached_get_tokenizer(model_config.tokenizer)
 
     if image_feature_size_override is None:
@@ -135,7 +146,7 @@ def input_processor_for_siglip(
     else:
         image_feature_size = image_feature_size_override
 
-    new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
         tokenizer,
         inputs.get("prompt"),
         inputs["prompt_token_ids"],
@@ -144,11 +155,10 @@ def input_processor_for_siglip(
     )
 
     # NOTE: Create a defensive copy of the original inputs
-    return token_inputs(
-        prompt_token_ids=new_token_ids,
-        prompt=new_prompt,
-        multi_modal_data=multi_modal_data,
-    )
+    return token_inputs(prompt_token_ids=new_token_ids,
+                        prompt=new_prompt,
+                        multi_modal_data=multi_modal_data,
+                        multi_modal_placeholders={"image": ranges})
 
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index f08e4aa355086..749750fc9c16e 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -2,7 +2,6 @@
 """PyTorch Ultravox model."""
 
 import math
-from array import array
 from functools import cached_property, lru_cache
 from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
                     TypedDict, Union, cast)
@@ -17,27 +16,27 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import INPUT_REGISTRY
-from vllm.inputs.data import DecoderOnlyInputs, token_inputs
-from vllm.inputs.registry import InputContext
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs, NestedTensors
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs,
+                             NestedTensors)
 from vllm.multimodal.utils import (cached_get_tokenizer,
+                                   consecutive_placeholder_ranges,
                                    repeat_and_pad_placeholder_tokens)
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 from vllm.utils import is_list_of
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, merge_multimodal_embeddings)
+                    init_vllm_registered_model,
+                    merge_multimodal_embeddings_from_map)
 
 _AUDIO_PLACEHOLDER_TOKEN = 128002
 _AUDIO_TOKENS_PER_SECOND = 6.25
@@ -46,13 +45,13 @@
 class UltravoxAudioFeatureInputs(TypedDict):
     type: Literal["audio_features"]
     data: NestedTensors
-    """Shape: `(batch_size, num_audios, 80, M)"""
+    """Shape: `(batch_size, num_audios, 80, M)`"""
 
 
 class UltravoxAudioEmbeddingInputs(TypedDict):
     type: Literal["audio_embeds"]
     data: NestedTensors
-    """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)"""
+    """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)`"""
 
 
 UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs,
@@ -79,17 +78,16 @@ def dummy_seq_data_for_ultravox(
     seq_len: int,
     audio_count: int,
 ):
-    audio_placeholder = array(
-        VLLM_TOKEN_ID_ARRAY_TYPE,
-        [_AUDIO_PLACEHOLDER_TOKEN]) * get_ultravox_max_audio_tokens(ctx)
+    audio_length = min(get_ultravox_max_audio_tokens(ctx),
+                       seq_len // audio_count)
 
-    # Add a separator between each chunk.
-    audio_token_ids = (audio_placeholder +
-                       array(VLLM_TOKEN_ID_ARRAY_TYPE, [0])) * audio_count
-    other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                            [0]) * (seq_len - len(audio_token_ids))
-
-    return SequenceData(audio_token_ids + other_token_ids)
+    return SequenceData.from_prompt_token_counts(
+        (_AUDIO_PLACEHOLDER_TOKEN, audio_length * audio_count),
+        (0, seq_len - audio_length * audio_count)), {
+            "audio":
+            consecutive_placeholder_ranges(num_items=audio_count,
+                                           item_size=audio_length)
+        }
 
 
 def dummy_audio_for_ultravox(
@@ -107,10 +105,10 @@ def dummy_data_for_ultravox(
     mm_counts: Mapping[str, int],
 ):
     audio_count = mm_counts["audio"]
-    seq_data = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count)
+    seq_data, ranges = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count)
     mm_dict = dummy_audio_for_ultravox(ctx, audio_count)
 
-    return (seq_data, mm_dict)
+    return DummyData(seq_data, mm_dict, ranges)
 
 
 def input_mapper_for_ultravox(ctx: InputContext, data: object):
@@ -164,6 +162,11 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs):
     if multi_modal_data is None or "audio" not in multi_modal_data:
         return inputs
 
+    if "multi_modal_placeholders" in inputs and "audio" in inputs[
+            "multi_modal_placeholders"]:
+        # The inputs already have placeholders.
+        return inputs
+
     feature_extractor = whisper_feature_extractor(ctx)
     audios = multi_modal_data["audio"]
     if not isinstance(audios, list):
@@ -197,7 +200,7 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs):
 
     tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
 
-    new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
         tokenizer,
         inputs.get("prompt"),
         inputs["prompt_token_ids"],
@@ -208,7 +211,8 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs):
     # NOTE: Create a defensive copy of the original inputs
     return token_inputs(prompt_token_ids=new_token_ids,
                         prompt=new_prompt,
-                        multi_modal_data=multi_modal_data)
+                        multi_modal_data=multi_modal_data,
+                        multi_modal_placeholders={"audio": ranges})
 
 
 class StackAudioFrames(nn.Module):
@@ -472,9 +476,9 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                 inputs_embeds = self.language_model.model.get_input_embeddings(
                     input_ids)
 
-                inputs_embeds = merge_multimodal_embeddings(
-                    input_ids, inputs_embeds, audio_embeddings,
-                    _AUDIO_PLACEHOLDER_TOKEN)
+                merge_multimodal_embeddings_from_map(
+                    inputs_embeds, audio_embeddings,
+                    attn_metadata.multi_modal_placeholder_index_maps["audio"])
                 input_ids = None
             else:
                 inputs_embeds = None
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 0aecb5d151a45..c6ec1769fc5d1 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -18,7 +18,7 @@
 from vllm.model_executor.model_loader.loader import build_model
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models import ModelRegistry
-from vllm.multimodal.base import NestedTensors
+from vllm.multimodal.base import MultiModalPlaceholderMap, NestedTensors
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_pin_memory_available
@@ -326,6 +326,22 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str:
         _embedding_count_expression(inner) for inner in embeddings)
 
 
+def merge_multimodal_embeddings_from_map(
+        inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors,
+        placeholder_map: MultiModalPlaceholderMap.IndexMap) -> torch.Tensor:
+    """
+    Merge ``multimodal_embeddings`` into ``inputs_embeds`` using the provided
+    placeholder map.
+
+    Note:
+        This updates ``inputs_embeds`` in place.
+    """
+    flattened_embeddings = _flatten_embeddings(multimodal_embeddings)
+    inputs_embeds[placeholder_map.dest] = flattened_embeddings[
+        placeholder_map.src]
+    return inputs_embeds
+
+
 def _merge_multimodal_embeddings(
     inputs_embeds: torch.Tensor,
     is_multimodal: torch.Tensor,
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 489e1e51f05cb..53da2badb9b98 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,6 +1,7 @@
 from .base import (BatchedTensorInputs, MultiModalDataBuiltins,
-                   MultiModalDataDict, MultiModalInputs, MultiModalPlugin,
-                   NestedTensors)
+                   MultiModalDataDict, MultiModalInputs,
+                   MultiModalPlaceholderDict, MultiModalPlaceholderMap,
+                   MultiModalPlugin, NestedTensors)
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -17,6 +18,8 @@
     "MultiModalDataBuiltins",
     "MultiModalDataDict",
     "MultiModalInputs",
+    "MultiModalPlaceholderDict",
+    "MultiModalPlaceholderMap",
     "MultiModalPlugin",
     "NestedTensors",
     "MULTIMODAL_REGISTRY",
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 84e71cbf60df7..6b10d0c609f13 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,8 +1,9 @@
 import sys
 from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
-from typing import (Any, Callable, Dict, List, Mapping, Optional, Tuple, Type,
-                    TypedDict, TypeVar, Union, cast, final)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping,
+                    NamedTuple, Optional, Tuple, Type, TypedDict, TypeVar,
+                    Union, cast, final)
 
 import numpy as np
 import torch
@@ -11,12 +12,15 @@
 from torch import nn
 from typing_extensions import TypeAlias
 
-from vllm.config import ModelConfig
 from vllm.inputs import InputContext
 from vllm.logger import init_logger
 from vllm.utils import (JSONTree, get_allowed_kwarg_only_overrides, is_list_of,
                         json_map_leaves, resolve_mm_processor_kwargs)
 
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+    from vllm.sequence import SequenceGroupMetadata
+
 logger = init_logger(__name__)
 
 NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor]
@@ -151,6 +155,30 @@ class MultiModalDataBuiltins(TypedDict, total=False):
     Read more on that :ref:`here <adding_multimodal_plugin>`.
 """
 
+
+class PlaceholderRange(TypedDict):
+    """
+    Placeholder location information for multi-modal data.
+
+    For example:
+        Prompt: AAAA BBBB What is in these images?
+        Images A and B will have:
+            A: { "offset": 0, "length": 4 }
+            B: { "offset": 5, "length": 4 }
+    """
+
+    offset: int
+    """The start index of the placeholder in the prompt."""
+
+    length: int
+    """The length of the placeholder."""
+
+
+MultiModalPlaceholderDict = Mapping[str, List[PlaceholderRange]]
+"""
+A dictionary mapping each modality key to its list of placeholder ranges.
+"""
+
 MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]],
                                  MultiModalInputs]
 """
@@ -243,7 +271,7 @@ def wrapper(model_cls: N) -> N:
 
         return wrapper
 
-    def map_input(self, model_config: ModelConfig,
+    def map_input(self, model_config: "ModelConfig",
                   data: MultiModalData[object],
                   mm_processor_kwargs: Dict[str, Any]) -> MultiModalInputs:
         """
@@ -332,7 +360,7 @@ def wrapper(model_cls: N) -> N:
 
         return wrapper
 
-    def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
+    def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
         """
         Get the maximum number of multi-modal tokens
         for profiling the memory usage of a model.
@@ -366,3 +394,179 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
         self._validate_max_multimodal_tokens(max_mm_tokens)
 
         return max_mm_tokens
+
+
+class MultiModalPlaceholderMap:
+    """
+    Relates multi-modal embeddings to their corresponding placeholders.
+    """
+
+    class IndexMap(NamedTuple):
+        src: List[int]
+        dest: List[int]
+
+    src_ranges: List[range]
+    """
+    The indices of the multi-modal embeddings that will replace the
+    corresponding placeholder embeddings pointed to by ``dest_ranges``.
+    """
+
+    src_len: int
+    """
+    The total number of flattened multi-modal embeddings.
+    """
+
+    dest_ranges: List[range]
+    """
+    The indices of the placeholder embeddings that will be replaced by the
+    multimodal embeddings.
+    """
+
+    dest_len: int
+    """
+    The total number of embeddings in the destination tensor.
+    """
+
+    def __init__(self):
+        self.src_ranges = []
+        self.src_len = 0
+        self.dest_ranges = []
+        self.dest_len = 0
+
+    @classmethod
+    def from_seq_group(
+        cls, seq_group: "SequenceGroupMetadata", positions: range
+    ) -> Tuple[Optional[MultiModalDataDict], Dict[str,
+                                                  "MultiModalPlaceholderMap"]]:
+        """
+        Returns the multi-modal items that intersect with the portion of a
+        prompt (``seq_group``) represented by ``positions``, as well as a
+        ``MultiModalPlaceholderMap`` that relates the multi-modal embedding
+        vectors to their corresponding placeholders.
+
+        Consider the following scenarios:
+
+           Prompt: |AAAA BBBB What's in these images?|
+        Positions: |.................................|
+
+            images      = [A, B]
+            src_ranges  = [(0, 4), (4, 8)]
+            dest_ranges = [(0, 4), (5, 9)]
+
+           Prompt: |AAAA BBBB What's in these images?|
+        Positions: |  .....                          |
+
+            images      = [A, B]
+            src_ranges  = [(2, 4), (4, 6)]
+            dest_ranges = [(0, 2), (3, 5)]
+
+           Prompt: |AAAA BBBB What's in these images?|
+        Positions: |     .........                   |
+
+            images      = [B]
+            src_ranges  = [(0, 4)]
+            dest_ranges = [(0, 4)]
+
+           Prompt: |AAAA BBBB What's in these images?|
+        Positions: |          .......................|
+
+            images      = []
+            src_ranges  = []
+            dest_ranges = []
+        """
+        if (not seq_group.multi_modal_data
+                or not seq_group.multi_modal_placeholders):
+            return seq_group.multi_modal_data, {}
+
+        mm_data = {**seq_group.multi_modal_data}
+        placeholder_maps: Dict[str, MultiModalPlaceholderMap] = defaultdict(
+            MultiModalPlaceholderMap)
+
+        for modality, placeholders in seq_group.multi_modal_placeholders.items(
+        ):
+            mm_items = mm_data.pop(modality)
+            if not isinstance(mm_items, list):
+                mm_items = [mm_items]
+
+            if positions:
+                intersecting_items = placeholder_maps[
+                    modality].append_items_from_seq_group(
+                        positions, mm_items, placeholders)
+
+                if intersecting_items:
+                    mm_data[modality] = intersecting_items
+
+        return mm_data, placeholder_maps
+
+    def append_items_from_seq_group(
+            self, positions: range, multi_modal_items: List[_T],
+            multi_modal_placeholders: List[PlaceholderRange]) -> List[_T]:
+        """
+        Adds the multi-modal items that intersect ``positions`` to this
+        placeholder map and returns the intersecting items.
+        """
+        intersecting_items = []
+
+        if len(multi_modal_items) != len(multi_modal_placeholders):
+            raise ValueError(
+                "Multi-modal placeholders and items must have the same length."
+            )
+        for placeholder_dict, mm_item in zip(multi_modal_placeholders,
+                                             multi_modal_items):
+            placeholder = range(
+                placeholder_dict["offset"],
+                placeholder_dict["offset"] + placeholder_dict["length"])
+            intersection = range(max(positions.start, placeholder.start),
+                                 min(positions.stop, placeholder.stop))
+
+            if not intersection:
+                # Skip this multi-modal item.
+                continue
+
+            token_embedding_range = range(intersection.start - positions.start,
+                                          intersection.stop - positions.start)
+
+            multimodal_embedding_range = range(
+                intersection.start - placeholder.start + self.src_len,
+                intersection.stop - placeholder.start + self.src_len)
+
+            intersecting_items.append(mm_item)
+            self.dest_ranges.append(token_embedding_range)
+            self.src_ranges.append(multimodal_embedding_range)
+            self.src_len += len(placeholder)
+
+        self.dest_len += len(positions)
+        return intersecting_items
+
+    def extend(self, other: "MultiModalPlaceholderMap"):
+        """
+        Adds the placeholders from another ``MultiModalPlaceholderMap`` to this
+        instance based on the source and destination tensors being
+        concatenated.
+        """
+
+        self.src_ranges.extend(
+            range(self.src_len + r.start, self.src_len + r.stop)
+            for r in other.src_ranges)
+        self.src_len += other.src_len
+        self.dest_ranges.extend(
+            range(self.dest_len + r.start, self.dest_len + r.stop)
+            for r in other.dest_ranges)
+        self.dest_len += other.dest_len
+
+    def index_map(self) -> "IndexMap":
+        """
+        Finalizes the placeholder map into lists of indices that can be used to
+        index the source and destination tensors.
+        """
+
+        src_indices = [i for r in self.src_ranges for i in r]
+        dest_indices = [i for r in self.dest_ranges for i in r]
+
+        if len(src_indices) != len(dest_indices):
+            raise ValueError(
+                f"The number of source ({len(src_indices)}) and destination "
+                f"indices ({len(dest_indices)}) must be the same.")
+
+        return MultiModalPlaceholderMap.IndexMap(src=src_indices,
+                                                 dest=dest_indices)
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 5f74bcea65ce2..3f6bb6c8338d2 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,11 +1,10 @@
 from functools import lru_cache
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 from PIL import Image
 from transformers.image_processing_base import BatchFeature
 
-from vllm.config import ModelConfig
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import get_image_processor
@@ -13,6 +12,9 @@
 
 from .base import MultiModalData, MultiModalInputs, MultiModalPlugin
 
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+
 logger = init_logger(__name__)
 
 cached_get_image_processor = lru_cache(get_image_processor)
@@ -26,7 +28,7 @@ def get_data_key(self) -> str:
 
     def _get_hf_image_processor(
         self,
-        model_config: ModelConfig,
+        model_config: "ModelConfig",
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
     ):
         if mm_processor_kwargs is None:
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 5e9b8bd518de3..bce2f4c6abe5b 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,8 +1,7 @@
 import functools
 from collections import UserDict
-from typing import Any, Dict, Mapping, Optional, Sequence
+from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Sequence
 
-from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
 from .audio import AudioPlugin
@@ -11,6 +10,9 @@
 from .image import ImagePlugin
 from .video import VideoPlugin
 
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+
 logger = init_logger(__name__)
 
 
@@ -20,7 +22,7 @@ class _MultiModalLimits(UserDict):
     when attempting to access a model that does not exist.
     """
 
-    def __getitem__(self, key: ModelConfig) -> Dict[str, int]:
+    def __getitem__(self, key: "ModelConfig") -> Dict[str, int]:
         try:
             return super().__getitem__(key)
         except KeyError as exc:
@@ -98,7 +100,7 @@ def register_image_input_mapper(
 
     def map_input(
         self,
-        model_config: ModelConfig,
+        model_config: "ModelConfig",
         data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
     ) -> MultiModalInputs:
@@ -139,7 +141,7 @@ def map_input(
 
         return MultiModalInputs(merged_dict)
 
-    def create_input_mapper(self, model_config: ModelConfig):
+    def create_input_mapper(self, model_config: "ModelConfig"):
         """
         Create an input mapper (see :meth:`map_input`) for a specific model.
         """
@@ -177,7 +179,7 @@ def register_max_image_tokens(
         """
         return self.register_max_multimodal_tokens("image", max_mm_tokens)
 
-    def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
+    def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
         """
         Get the maximum number of multi-modal tokens
         for profiling the memory usage of a model.
@@ -195,7 +197,7 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
 
     def init_mm_limits_per_prompt(
         self,
-        model_config: ModelConfig,
+        model_config: "ModelConfig",
     ) -> None:
         """
         Initialize the maximum number of multi-modal input instances for each
@@ -231,7 +233,7 @@ def init_mm_limits_per_prompt(
 
     def get_mm_limits_per_prompt(
         self,
-        model_config: ModelConfig,
+        model_config: "ModelConfig",
     ) -> Mapping[str, int]:
         """
         Get the maximum number of multi-modal input instances for each modality
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 3c801464383ad..c5ff552e06099 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -10,7 +10,7 @@
 from vllm.connections import global_http_connection
 from vllm.envs import VLLM_AUDIO_FETCH_TIMEOUT, VLLM_IMAGE_FETCH_TIMEOUT
 from vllm.logger import init_logger
-from vllm.multimodal.base import MultiModalDataDict
+from vllm.multimodal.base import MultiModalDataDict, PlaceholderRange
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
 logger = init_logger(__name__)
@@ -258,7 +258,7 @@ def repeat_and_pad_placeholder_tokens(
     repeat_count: Union[int, List[int]],
     pad_token_left: Optional[int] = None,
     pad_token_right: Optional[int] = None,
-) -> Tuple[Optional[str], List[int]]:
+) -> Tuple[Optional[str], List[int], List[PlaceholderRange]]:
     if isinstance(repeat_count, int):
         repeat_count = [repeat_count]
 
@@ -301,6 +301,7 @@ def repeat_and_pad_placeholder_tokens(
         new_prompt += prompt_parts[-1]
 
     new_token_ids: List[int] = []
+    placeholder_ranges: List[PlaceholderRange] = []
     placeholder_token_idx = 0
     for i, token in enumerate(prompt_token_ids):
         if token == placeholder_token_id:
@@ -310,6 +311,10 @@ def repeat_and_pad_placeholder_tokens(
                 pad_token_left=pad_token_left,
                 pad_token_right=pad_token_right,
             )
+            placeholder_ranges.append({
+                "offset": len(new_token_ids),
+                "length": len(replacement_ids)
+            })
             new_token_ids.extend(replacement_ids)
             placeholder_token_idx += 1
 
@@ -320,4 +325,14 @@ def repeat_and_pad_placeholder_tokens(
         else:
             new_token_ids.append(token)
 
-    return new_prompt, new_token_ids
+    return new_prompt, new_token_ids, placeholder_ranges
+
+
+def consecutive_placeholder_ranges(num_items: int,
+                                   item_size: int) -> List[PlaceholderRange]:
+    """Returns num_items back-to-back PlaceholderRanges of size item_size"""
+
+    return [
+        PlaceholderRange(offset=i * item_size, length=item_size)
+        for i in range(num_items)
+    ]
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index c3235c4acb6fd..6c2c6720f4276 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,18 +1,19 @@
 from functools import lru_cache
-from typing import Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 import numpy as np
 
-from vllm.config import ModelConfig
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import get_video_processor
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.utils import is_list_of
 
 from .base import MultiModalData, MultiModalInputs
 from .image import ImagePlugin
 
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+
 logger = init_logger(__name__)
 
 cached_get_video_processor = lru_cache(get_video_processor)
@@ -38,7 +39,7 @@ def get_data_key(self) -> str:
 
     def _get_hf_video_processor(
         self,
-        model_config: ModelConfig,
+        model_config: "ModelConfig",
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
     ):
         if mm_processor_kwargs is None:
@@ -56,7 +57,10 @@ def _default_input_mapper(
     ) -> MultiModalInputs:
         model_config = ctx.model_config
 
-        if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray):
+        if isinstance(data, list) and len(data) == 1:
+            data = data[0]
+
+        if isinstance(data, np.ndarray):
             video_processor = self._get_hf_video_processor(
                 model_config,
                 mm_processor_kwargs,
diff --git a/vllm/sequence.py b/vllm/sequence.py
index ff59f333f00b4..ee547dde45394 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -15,13 +15,13 @@
 
 from vllm.inputs.parse import is_encoder_decoder_inputs
 from vllm.lora.request import LoRARequest
+from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 
 if TYPE_CHECKING:
     from vllm.inputs import SingletonInputs
-    from vllm.multimodal.base import MultiModalDataDict
 
 VLLM_TOKEN_ID_ARRAY_TYPE = "l"
 
@@ -485,7 +485,7 @@ def prompt_token_ids(self) -> List[int]:
         return cast(List[int], self.inputs.get(prompt_token_ids_key))
 
     @property
-    def multi_modal_data(self) -> "MultiModalDataDict":
+    def multi_modal_data(self) -> MultiModalDataDict:
         inputs = self.inputs
 
         if (inputs.get("multi_modal_data")
@@ -495,11 +495,15 @@ def multi_modal_data(self) -> "MultiModalDataDict":
             )
 
         return cast(
-            "MultiModalDataDict",
+            MultiModalDataDict,
             (inputs.get("multi_modal_data")
              or inputs.get("encoder_multi_modal_data") or {}),
         )
 
+    @property
+    def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
+        return self.inputs.get("multi_modal_placeholders") or {}
+
     @property
     def mm_processor_kwargs(self) -> Dict[str, Any]:
         return self.inputs.get("mm_processor_kwargs") or {}
@@ -728,9 +732,13 @@ def encoder_prompt_token_ids(self) -> Optional[List[int]]:
                 if self.encoder_seq is not None else None)
 
     @property
-    def multi_modal_data(self) -> "MultiModalDataDict":
+    def multi_modal_data(self) -> MultiModalDataDict:
         return self.first_seq.multi_modal_data
 
+    @property
+    def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
+        return self.first_seq.multi_modal_placeholders
+
     @property
     def mm_processor_kwargs(self) -> Dict[str, Any]:
         return self.first_seq.mm_processor_kwargs
@@ -946,6 +954,7 @@ class SequenceGroupMetadata(
     # "MultiModalDataDict" types. We have to use Any due to msgspec
     # doesn't allow to have union of 2 different dicts.
     multi_modal_data: Optional[Any] = None
+    multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
     mm_processor_kwargs: Optional[Dict[str, Any]] = None
     encoder_seq_data: Optional[SequenceData] = None
     cross_block_table: Optional[List[int]] = None
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 5032896600b3b..0c6fcdf03ba9e 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,5 +1,6 @@
 import dataclasses
 import weakref
+from collections import defaultdict
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
 
@@ -16,7 +17,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalInputs)
+                             MultiModalInputs, MultiModalPlaceholderMap)
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
 from vllm.transformers_utils.config import uses_mrope
@@ -148,9 +149,18 @@ def build(self) -> ModelInputForCPU:
             query_lens=seq_lens,
         )
 
-    def _compute_multi_modal_input(self, seq_data: SequenceData, mm_data,
-                                   computed_len: int,
+    def _compute_multi_modal_input(self, seq_group: SequenceGroupMetadata,
+                                   seq_data: SequenceData, computed_len: int,
                                    mm_processor_kwargs: Dict[str, Any]):
+
+        # NOTE: mm_data only includes the subset of multi-modal items that
+        # intersect with the current prefill positions.
+        mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group(
+            seq_group, range(computed_len, len(seq_data.get_token_ids())))
+
+        if not mm_data:
+            return
+
         mm_kwargs = self.multi_modal_input_mapper(mm_data, mm_processor_kwargs)
 
         # special processing for mrope position deltas.
@@ -179,7 +189,7 @@ def _compute_multi_modal_input(self, seq_data: SequenceData, mm_data,
                     context_len=computed_len,
                 )
             seq_data.mrope_position_delta = mrope_position_delta
-        return mm_kwargs, mrope_positions
+        return mm_kwargs, placeholder_maps, mrope_positions
 
     def _prepare_prompt(
         self,
@@ -194,6 +204,9 @@ def _prepare_prompt(
         slot_mapping: List[int] = []
         seq_lens: List[int] = []
         multi_modal_inputs_list: List[MultiModalInputs] = []
+        multi_modal_placeholder_maps: Dict[
+            str,
+            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
 
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
@@ -210,11 +223,15 @@ def _prepare_prompt(
             input_tokens.extend(prompt_tokens)  # Token ids
 
             mrope_positions = None
-            if (mm_data := seq_group_metadata.multi_modal_data):
-                mm_kwargs, mrope_positions = self._compute_multi_modal_input(
-                    seq_data, mm_data, computed_len,
+            if seq_group_metadata.multi_modal_data:
+                mm_kwargs, placeholder_maps, mrope_positions = self \
+                    ._compute_multi_modal_input(
+                        seq_group_metadata, seq_data, computed_len,
                     seq_group_metadata.mm_processor_kwargs)
                 multi_modal_inputs_list.append(mm_kwargs)
+                for modality, placeholder_map in placeholder_maps.items():
+                    multi_modal_placeholder_maps[modality].extend(
+                        placeholder_map)
 
             # Token position ids
             # NOTE(woosuk): Here we assume that the first token in the prompt
@@ -264,6 +281,11 @@ def _prepare_prompt(
         slot_mapping = torch.tensor(slot_mapping,
                                     dtype=torch.long,
                                     device=self.device)  # type: ignore
+        placeholder_index_maps = {
+            modality: placeholder_map.index_map()
+            for modality, placeholder_map in
+            multi_modal_placeholder_maps.items()
+        }
 
         attn_metadata = self.attn_backend.make_metadata(
             is_prompt=True,
@@ -275,6 +297,7 @@ def _prepare_prompt(
             num_decode_tokens=0,
             block_tables=torch.tensor([]),
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=placeholder_index_maps,
         )
 
         multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
@@ -366,6 +389,7 @@ def _prepare_decode(
         attn_metadata = self.attn_backend.make_metadata(
             is_prompt=False,
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=None,
             seq_lens=seq_lens,
             seq_lens_tensor=seq_lens_tensor,
             max_decode_seq_len=max_decode_seq_len,
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 6a00444f5098b..a4b665d71f28a 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -306,13 +306,12 @@ def profile_run(self) -> None:
                        (group_id < max_num_batched_tokens % max_num_seqs))
             batch_size += seq_len
 
-            decoder_seq_data, decoder_dummy_multi_modal_data \
-                = self.input_registry.dummy_data_for_profiling(
-                    self.model_config,
+            decoder_dummy_data = self.input_registry \
+                .dummy_data_for_profiling(self.model_config,
                                           seq_len,
                                           self.mm_registry,
                                           is_encoder_data=False)
-            encoder_seq_data, encoder_dummy_multi_modal_data \
+            encoder_dummy_data \
                 = self.input_registry.dummy_data_for_profiling(
                     self.model_config,
                                          seq_len,
@@ -320,26 +319,31 @@ def profile_run(self) -> None:
                                          is_encoder_data=True)
 
             # Having more tokens is over-conservative but otherwise fine
-            assert len(decoder_seq_data.prompt_token_ids) >= seq_len, (
+            assert len(
+                decoder_dummy_data.seq_data.prompt_token_ids
+            ) >= seq_len, (
                 f"Expected at least {seq_len} dummy tokens for profiling, "
-                f"but got: {len(decoder_seq_data.prompt_token_ids)}")
+                f"but got: {len(decoder_dummy_data.seq_data.prompt_token_ids)}"
+            )
 
-            assert decoder_dummy_multi_modal_data is None or \
-            encoder_dummy_multi_modal_data is None, (
+            assert decoder_dummy_data.multi_modal_data is None or \
+            encoder_dummy_data.multi_modal_data is None, (
                 "Multi-modal data can't be provided in both encoder and decoder"
             )
 
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),
                 is_prompt=True,
-                seq_data={group_id: decoder_seq_data},
+                seq_data={group_id: decoder_dummy_data.seq_data},
                 sampling_params=sampling_params,
                 block_tables=None,
-                encoder_seq_data=encoder_seq_data,
+                encoder_seq_data=encoder_dummy_data.seq_data,
                 cross_block_table=None,
-                multi_modal_data=decoder_dummy_multi_modal_data
-                or encoder_dummy_multi_modal_data,
-            )
+                multi_modal_data=decoder_dummy_data.multi_modal_data
+                or encoder_dummy_data.multi_modal_data,
+                multi_modal_placeholders=decoder_dummy_data.
+                multi_modal_placeholders
+                or encoder_dummy_data.multi_modal_placeholders)
             seqs.append(seq)
 
         # Run the model with the dummy inputs.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 891637dafbb14..f2123c64c3274 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -40,7 +40,8 @@
 from vllm.model_executor.models import supports_lora, supports_multimodal
 from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalInputs, MultiModalRegistry)
+                             MultiModalInputs, MultiModalPlaceholderMap,
+                             MultiModalRegistry)
 from vllm.platforms import current_platform
 from vllm.prompt_adapter.layers import PromptAdapterMapping
 from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -242,6 +243,8 @@ def __init__(
 
             # Multi-modal inputs.
             multi_modal_inputs: Optional[MultiModalInputs] = None,
+            multi_modal_placeholder_maps: Optional[Dict[
+                str, MultiModalPlaceholderMap]] = None,
 
             # Whether the prefix cache is hit (prefill only).
             prefix_cache_hit: bool = False,
@@ -361,6 +364,7 @@ def __init__(
 
             self.prompt_adapter_request = prompt_adapter_request
             self.multi_modal_inputs = multi_modal_inputs
+            self.multi_modal_placeholder_maps = multi_modal_placeholder_maps
             self.prefix_cache_hit = prefix_cache_hit
 
             self.n_seqs = len(self.seq_ids)
@@ -635,7 +639,12 @@ def _compute_prompt_adapter_input(
     def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
                                    seq_group_metadata: SequenceGroupMetadata):
         """If multi-modal data is given, add it to the input."""
-        mm_data = seq_group_metadata.multi_modal_data
+        # NOTE: mm_data only includes the subset of multi-modal items that
+        # intersect with the current prefill positions.
+        positions = inter_data.input_positions[0]
+        mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group(
+            seq_group_metadata,
+            range(positions[0], positions[0] + len(positions)))
         if not mm_data:
             return
 
@@ -643,6 +652,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
             mm_data,
             mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs)
         inter_data.multi_modal_inputs = mm_kwargs
+        inter_data.multi_modal_placeholder_maps = placeholder_maps
 
         # special processing for mrope position deltas.
         if self.runner.model_is_mrope:
@@ -1255,7 +1265,7 @@ def profile_run(self) -> None:
                        (group_id < max_num_batched_tokens % max_num_seqs))
             batch_size += seq_len
 
-            seq_data, dummy_multi_modal_data = self.input_registry \
+            dummy_data = self.input_registry \
                 .dummy_data_for_profiling(self.model_config,
                                           seq_len,
                                           self.mm_registry)
@@ -1263,12 +1273,13 @@ def profile_run(self) -> None:
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),
                 is_prompt=True,
-                seq_data={group_id: seq_data},
+                seq_data={group_id: dummy_data.seq_data},
                 sampling_params=sampling_params,
                 block_tables=None,
                 lora_request=dummy_lora_requests_per_seq[group_id]
                 if dummy_lora_requests_per_seq else None,
-                multi_modal_data=dummy_multi_modal_data,
+                multi_modal_data=dummy_data.multi_modal_data,
+                multi_modal_placeholders=dummy_data.multi_modal_placeholders,
             )
             seqs.append(seq)
 
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index 86883cf152449..89d7addb5a8d9 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -46,9 +46,8 @@ def _init_attn_metadata_from_tensor_dict(
     # Extract the fields used to create AttentionMetadata.
     valid_attn_kwargs = {}
     for field in dataclasses.fields(attn_backend.get_metadata_cls()):
-        val = tensor_dict.pop(field.name, None)
-        if val is not None:
-            valid_attn_kwargs[field.name] = val
+        if field.name in tensor_dict:
+            valid_attn_kwargs[field.name] = tensor_dict.pop(field.name)
 
     attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs)
     tensor_dict["attn_metadata"] = attn_metadata
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index a164fbe3393c4..3da738636a59d 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -1,4 +1,5 @@
-from typing import List, NamedTuple, Optional, Tuple
+from collections import defaultdict
+from typing import Dict, List, NamedTuple, Optional, Tuple
 
 import openvino as ov
 import torch
@@ -14,7 +15,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.openvino import get_model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalInputs)
+                             MultiModalInputs, MultiModalPlaceholderMap)
 from vllm.sequence import SequenceGroupMetadata
 
 logger = init_logger(__name__)
@@ -115,6 +116,9 @@ def _prepare_model_input(
         past_lens: List[int] = []
         query_lens: List[int] = []
         multi_modal_inputs_list: List[MultiModalInputs] = []
+        multi_modal_placeholder_maps: Dict[
+            str,
+            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
 
         subsequence_begins: List[int] = []
         block_indices: List[int] = []
@@ -168,15 +172,6 @@ def _prepare_model_input(
                                     and self.sliding_window is None
                                     and is_prompt)
 
-                mm_data = seq_group_metadata.multi_modal_data
-                if mm_data:
-                    mm_kwargs = self.multi_modal_input_mapper(
-                        mm_data,
-                        mm_processor_kwargs=seq_group_metadata.
-                        mm_processor_kwargs,
-                    )
-                    multi_modal_inputs_list.append(mm_kwargs)
-
                 block_table = seq_group_metadata.block_tables[seq_id]
                 # TODO(sang): Combine chunked prefill and prefix caching by
                 # only allowing multiple of block_size chunk size.
@@ -220,7 +215,8 @@ def _prepare_model_input(
                 query_lens.append(query_len)
 
                 input_tokens.extend(tokens)
-                input_positions.extend(list(range(computed_len, seq_len)))
+                positions_range = range(computed_len, seq_len)
+                input_positions.extend(list(positions_range))
 
                 past_lens.append(computed_len)
                 subsequence_begins.append(subsequence_begins[-1] + query_len)
@@ -233,6 +229,22 @@ def _prepare_model_input(
                     ), "seq_len: {}, computed_len: {}, query_len: {}".format(
                         seq_len, computed_len, query_len)
 
+                if seq_group_metadata.multi_modal_data:
+                    # NOTE: mm_data only includes the subset of multi-modal
+                    # items that intersect with the current prefill positions.
+                    mm_data, placeholder_maps = MultiModalPlaceholderMap \
+                        .from_seq_group(seq_group_metadata, positions_range)
+
+                    mm_kwargs = self.multi_modal_input_mapper(
+                        mm_data,
+                        mm_processor_kwargs=seq_group_metadata.
+                        mm_processor_kwargs)
+                    multi_modal_inputs_list.append(mm_kwargs)
+
+                    for modality, placeholder_map in placeholder_maps.items():
+                        multi_modal_placeholder_maps[modality].extend(
+                            placeholder_map, )
+
         max_query_len = max(query_lens)
         assert max_query_len > 0, "query_lens: {}".format(query_lens)
 
@@ -261,12 +273,19 @@ def _prepare_model_input(
             max_context_len, dtype=torch.int32,
             device=self.device)  # type: ignore
 
+        placeholder_index_maps = {
+            modality: placeholder_map.index_map()
+            for modality, placeholder_map in
+            multi_modal_placeholder_maps.items()
+        }
+
         attn_metadata = self.attn_backend.make_openvino_metadata(
             past_lens=past_lens_tensor,
             subsequence_begins=subsequence_begins_tensor,
             block_indices=block_indices_tensor,
             block_indices_begins=block_indices_begins_tensor,
             max_context_len=max_context_len_tensor,
+            multi_modal_placeholder_index_maps=placeholder_index_maps,
         )
 
         multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 87ced7818a676..3792cbc0f730f 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -184,6 +184,7 @@ def _dummy_run(
                 num_prefill_tokens=batch_size * seq_len,
                 num_decode_tokens=0,
                 slot_mapping=slot_mapping,
+                multi_modal_placeholder_index_maps=None,
                 block_tables=None,
                 context_lens=None,
             )
@@ -216,6 +217,7 @@ def _dummy_run(
                 num_prefill_tokens=0,
                 num_decode_tokens=batch_size * seq_len,
                 slot_mapping=slot_mapping,
+                multi_modal_placeholder_index_maps=None,
                 block_tables=block_tables,
                 context_lens=context_lens,
             )
@@ -360,6 +362,7 @@ def _prepare_prompt(
             num_prefill_tokens=0,  # NOTE: This is not used.
             num_decode_tokens=0,
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=None,
             block_tables=None,
             context_lens=None,
         )
@@ -429,6 +432,7 @@ def _prepare_decode(
             num_prefill_tokens=0,
             num_decode_tokens=batch_size,
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=None,
             block_tables=block_tables,
             context_lens=context_lens,
         )
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 75a6de3b24ba4..739fe1b3d2c4f 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -1,6 +1,7 @@
 import dataclasses
 import time
 import weakref
+from collections import defaultdict
 from dataclasses import dataclass
 from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple,
                     Type, TypeVar)
@@ -19,7 +20,8 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalInputs, MultiModalRegistry)
+                             MultiModalInputs, MultiModalPlaceholderMap,
+                             MultiModalRegistry)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.utils import DeviceMemoryProfiler, make_tensor_with_pad
@@ -161,6 +163,9 @@ def _prepare_prompt(
         slot_mapping: List[int] = []
         seq_lens: List[int] = []
         multi_modal_inputs_list: List[MultiModalInputs] = []
+        multi_modal_placeholder_maps: Dict[
+            str,
+            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
 
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
@@ -179,7 +184,21 @@ def _prepare_prompt(
             # Token position ids
             # NOTE(woosuk): Here we assume that the first token in the prompt
             # is always the first token in the sequence.
-            input_positions.extend(list(range(computed_len, seq_len)))
+            positions_range = range(computed_len, seq_len)
+            input_positions.extend(list(positions_range))
+
+            if seq_group_metadata.multi_modal_data:
+                # NOTE: mm_data only includes the subset of multi-modal items
+                # that intersect with the current prefill positions.
+                mm_data, placeholder_maps = MultiModalPlaceholderMap \
+                    .from_seq_group(seq_group_metadata, positions_range)
+
+                mm_kwargs = self.runner.multi_modal_input_mapper(mm_data)
+                multi_modal_inputs_list.append(mm_kwargs)
+
+                for modality, placeholder_map in placeholder_maps.items():
+                    multi_modal_placeholder_maps[modality].extend(
+                        placeholder_map)
 
             if seq_group_metadata.block_tables is None:
                 # During memory profiling, the block tables are not initialized
@@ -220,6 +239,11 @@ def _prepare_prompt(
         slot_mapping = torch.tensor(slot_mapping,
                                     dtype=torch.long,
                                     device=self.device)  # type: ignore
+        placeholder_index_maps = {
+            modality: placeholder_map.index_map()
+            for modality, placeholder_map in
+            multi_modal_placeholder_maps.items()
+        }
 
         max_seqlen = max(seq_lens)
         tmp = [0]
@@ -230,6 +254,7 @@ def _prepare_prompt(
         attn_metadata = self.attn_backend.make_metadata(
             is_prompt=True,
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=placeholder_index_maps,
             seq_lens=seq_lens,
             seqlen_q=seqlen_q,
             max_seqlen=max_seqlen,
@@ -313,6 +338,7 @@ def _prepare_decode(
         attn_metadata = self.attn_backend.make_metadata(
             is_prompt=False,
             slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=None,
             seq_lens=seq_lens,
             seqlen_q=torch.tensor([]),
             max_seqlen=0,
@@ -450,7 +476,7 @@ def profile_run(self) -> None:
                        (group_id < max_num_batched_tokens % max_num_seqs))
             batch_size += seq_len
 
-            seq_data, dummy_multi_modal_data = self.input_registry \
+            dummy_data = self.input_registry \
                 .dummy_data_for_profiling(self.model_config,
                                           seq_len,
                                           self.mm_registry)
@@ -458,12 +484,12 @@ def profile_run(self) -> None:
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),
                 is_prompt=True,
-                seq_data={group_id: seq_data},
+                seq_data={group_id: dummy_data.seq_data},
                 sampling_params=sampling_params,
                 block_tables=None,
                 lora_request=None,
-                multi_modal_data=dummy_multi_modal_data,
-            )
+                multi_modal_data=dummy_data.multi_modal_data,
+                multi_modal_placeholders=dummy_data.multi_modal_placeholders)
             seqs.append(seq)
 
         # Run the model with the dummy inputs.

From d522034c85e8f994bbd193514393056232edd247 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Fri, 1 Nov 2024 13:56:13 -1000
Subject: [PATCH 22/85] [ci/build] Have dependabot ignore pinned dependencies
 (#9935)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .github/dependabot.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index a21acd9671eeb..4f54eea564ecb 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -14,6 +14,15 @@ updates:
     reviewers: ["khluu", "simon-mo"]
     allow:
       - dependency-type: "all"
+    ignore:
+      - dependency-name: "torch"
+      - dependency-name: "torchvision"
+      - dependency-name: "xformers"
+      - dependency-name: "lm-format-enforcer"
+      - dependency-name: "gguf"
+      - dependency-name: "compressed-tensors"
+      - dependency-name: "ray[adag]"
+      - dependency-name: "lm-eval"
     groups:
       patch-update:
         applies-to: version-updates

From a78dd3303efac284afc6785eddba5f175285863b Mon Sep 17 00:00:00 2001
From: sroy745 <142070531+sroy745@users.noreply.github.com>
Date: Fri, 1 Nov 2024 23:22:49 -0700
Subject: [PATCH 23/85] [Encoder Decoder] Add flash_attn kernel support for
 encoder-decoder models (#9559)

---
 tests/encoder_decoder/test_e2e_correctness.py |  88 +++--
 tests/kernels/test_encoder_decoder_attn.py    | 156 ++++++--
 tests/kernels/utils.py                        |  90 ++++-
 .../vision_language/test_florence2.py         |   2 +-
 vllm/attention/backends/flash_attn.py         | 364 +++++++++++++-----
 vllm/attention/backends/utils.py              | 159 +++++++-
 vllm/attention/backends/xformers.py           | 131 ++-----
 vllm/attention/selector.py                    |   2 +-
 vllm/model_executor/models/bart.py            |   2 -
 vllm/utils.py                                 |   4 +-
 vllm/worker/enc_dec_model_runner.py           |  35 +-
 11 files changed, 716 insertions(+), 317 deletions(-)

diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py
index bef0c515b9073..f2d7e9fd78cf3 100644
--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -7,12 +7,18 @@
 import pytest
 from transformers import AutoModelForSeq2SeqLM
 
+from vllm.attention.selector import (_Backend,
+                                     global_force_attn_backend_context_manager)
 from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
 
 from ..conftest import DecoderPromptType
 from ..models.utils import check_logprobs_close
 
+LIST_ENC_DEC_SUPPORTED_BACKENDS = [
+    _Backend.XFORMERS, _Backend.FLASH_ATTN, None
+]
+
 
 def vllm_to_hf_output(
     vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
@@ -29,7 +35,8 @@ def vllm_to_hf_output(
 
 
 @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@@ -48,6 +55,7 @@ def test_encoder_decoder_e2e(
     num_logprobs: int,
     decoder_prompt_type: DecoderPromptType,
     enforce_eager: bool,
+    attn_backend: _Backend,
 ) -> None:
     '''
     End-to-End (E2E) test for the encoder-decoder framework.
@@ -56,43 +64,49 @@ def test_encoder_decoder_e2e(
     implementations to ensure that both implementations produce consistent
     and correct results.
     '''
-    test_case_prompts = example_encoder_decoder_prompts[decoder_prompt_type]
+    with global_force_attn_backend_context_manager(attn_backend):
+        if attn_backend == _Backend.FLASH_ATTN:
+            # FlashAttention supports only fp16/bf16, not float32
+            dtype = 'bfloat16'
+        test_case_prompts = example_encoder_decoder_prompts[
+            decoder_prompt_type]
 
-    # Configuration settings for HF baseline
-    hf_kwargs = {
-        "top_k": None,
-        "num_beams": 1,
-        "repetition_penalty": 1.0,
-        "top_p": 1.0,
-        "length_penalty": 1.0,
-        "early_stopping": False,
-        "no_repeat_ngram_size": None,
-        "min_length": 0
-    }
+        # Configuration settings for HF baseline
+        hf_kwargs = {
+            "top_k": None,
+            "num_beams": 1,
+            "repetition_penalty": 1.0,
+            "top_p": 1.0,
+            "length_penalty": 1.0,
+            "early_stopping": False,
+            "no_repeat_ngram_size": None,
+            "min_length": 0
+        }
 
-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
-        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-            test_case_prompts,
-            max_tokens,
-            num_logprobs,
-            **hf_kwargs,
-        ))
-    with vllm_runner(model, dtype=dtype,
-                     enforce_eager=enforce_eager) as vllm_model:
-        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
-            test_case_prompts, max_tokens, num_logprobs)
+        with hf_runner(model, dtype=dtype,
+                       auto_cls=AutoModelForSeq2SeqLM) as hf_model:
+            hf_outputs = (
+                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
+                    test_case_prompts,
+                    max_tokens,
+                    num_logprobs,
+                    **hf_kwargs,
+                ))
+        with vllm_runner(model, dtype=dtype,
+                         enforce_eager=enforce_eager) as vllm_model:
+            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+                test_case_prompts, max_tokens, num_logprobs)
 
-    hf_skip_tokens = (1
-                      if decoder_prompt_type == DecoderPromptType.NONE else 0)
+        hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
+                          else 0)
 
-    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=[
-            vllm_to_hf_output(vllm_output, decoder_prompt_type)
-            for vllm_output in vllm_outputs
-        ],
-        name_0="hf",
-        name_1="vllm",
-        num_outputs_0_skip_tokens=hf_skip_tokens,
-    )
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, decoder_prompt_type)
+                for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+            num_outputs_0_skip_tokens=hf_skip_tokens,
+        )
diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py
index bc99c5559d388..a1dd5eeeaa398 100644
--- a/tests/kernels/test_encoder_decoder_attn.py
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -16,13 +16,13 @@
 from vllm.attention import (Attention, AttentionBackend, AttentionMetadata,
                             AttentionType)
 from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
-from vllm.attention.selector import (_Backend,
+from vllm.attention.selector import (_Backend, get_attn_backend,
                                      global_force_attn_backend_context_manager)
+from vllm.forward_context import set_forward_context
 from vllm.platforms import current_platform
 
 # List of support backends for encoder/decoder models
-LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS]
-
+LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN]
 HEAD_SIZES = [64, 256]
 
 NUM_HEADS = [1, 16]
@@ -145,7 +145,8 @@ class that Attention will automatically select when it is constructed.
                              test_pt.num_heads,
                              test_pt.head_size,
                              test_pt.block_size,
-                             device=CUDA_DEVICE)
+                             device=CUDA_DEVICE,
+                             backend=test_pt.backend_name)
     return TestResources(scale, attn_backend, attn, kv_cache)
 
 
@@ -592,6 +593,7 @@ def _run_encoder_attention_test(
     attn: Attention,
     encoder_test_params: PhaseTestParameters,
     attn_metadata: AttentionMetadata,
+    test_pt: TestPoint,
 ) -> torch.Tensor:
     '''
     Run encoder attention.
@@ -610,6 +612,8 @@ def _run_encoder_attention_test(
                            (number_of_tokens x num_heads x head_size)
                            query/key/value fields
     * attn_metadata: attention metadata for encoder/decoder-self attention
+    * test_pt: The TestPoint object containing test details like number of
+               model heads, head size, name of the backend being used etc.
 
     Returns:
     * Attention.forward() applied to packed {query,key,value} and
@@ -619,20 +623,31 @@ def _run_encoder_attention_test(
     attn_type = AttentionType.ENCODER
     packed_qkv = encoder_test_params.packed_qkvo.packed_qkv
     assert packed_qkv is not None
-    return attn.forward(packed_qkv.query,
-                        packed_qkv.key,
-                        packed_qkv.value,
-                        torch.tensor([],
-                                     dtype=torch.float32,
-                                     device=packed_qkv.query.device),
-                        attn_metadata,
-                        attn_type=attn_type)
+    with set_forward_context(attn_metadata):
+        # In the test setup the shape of the query is
+        # [batch_size, seq_len, num_heads, head_size]. However
+        # the attention backend expects the shape to be
+        # [num_tokens, hidden_size]. Hence reshape the query before
+        # invoking the forward method.
+        # TODO - Update the way we construct the query so that it
+        # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
+        reshaped_query = packed_qkv.query.view(
+            -1, test_pt.num_heads * test_pt.head_size)
+        return attn.forward(reshaped_query,
+                            packed_qkv.key,
+                            packed_qkv.value,
+                            torch.tensor([],
+                                         dtype=torch.float32,
+                                         device=packed_qkv.query.device),
+                            attn_metadata,
+                            attn_type=attn_type)
 
 
 def _run_decoder_self_attention_test(
     test_rsrcs: TestResources,
     decoder_test_params: PhaseTestParameters,
     attn_metadata: AttentionMetadata,
+    test_pt: TestPoint,
 ) -> torch.Tensor:
     '''
     Run decoder self-attention test.
@@ -650,6 +665,8 @@ def _run_decoder_self_attention_test(
                            query/key/value fields
     * attn_metadata: attention metadata for decoder-self attention
                      (contains KV cache memory-mapping)
+    * test_pt: The TestPoint object containing test details like number of
+               model heads, head size, name of the backend being used etc.
 
     Returns:
     * Attention.forward() applied to packed_{query,key,value}, kv_cache
@@ -660,12 +677,22 @@ def _run_decoder_self_attention_test(
     kv_cache = test_rsrcs.kv_cache
     packed_qkv = decoder_test_params.packed_qkvo.packed_qkv
     assert packed_qkv is not None
-    return attn.forward(packed_qkv.query,
-                        packed_qkv.key,
-                        packed_qkv.value,
-                        kv_cache,
-                        attn_metadata,
-                        attn_type=attn_type)
+    with set_forward_context(attn_metadata):
+        # In the test setup the shape of the query is
+        # [batch_size, seq_len, num_heads, head_size]. However
+        # the attention backend expects the shape to be
+        # [num_tokens, hidden_size]. Hence reshape the query before
+        # invoking the forward method.
+        # TODO - Update the way we construct the query so that it
+        # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
+        reshaped_query = packed_qkv.query.view(
+            -1, test_pt.num_heads * test_pt.head_size)
+        return attn.forward(reshaped_query,
+                            packed_qkv.key,
+                            packed_qkv.value,
+                            kv_cache,
+                            attn_metadata,
+                            attn_type=attn_type)
 
 
 def _run_encoder_decoder_cross_attention_test(
@@ -673,6 +700,7 @@ def _run_encoder_decoder_cross_attention_test(
     decoder_test_params: PhaseTestParameters,
     cross_test_params: Optional[PhaseTestParameters],
     attn_metadata: AttentionMetadata,
+    test_pt: TestPoint,
 ) -> torch.Tensor:
     '''
     Run encoder/decoder cross-attention test.
@@ -701,6 +729,8 @@ def _run_encoder_decoder_cross_attention_test(
                          (number_of_tokens x num_heads x head_size)
                          key/value fields
     * attn_metadata: attention metadata for encoder/decoder-self attention
+    * test_pt: The TestPoint object containing test details like number of
+               model heads, head size, name of the backend being used etc.
 
     Returns:
     * Attention.forward() applied to packed_{query,key,value}, kv_cache
@@ -718,12 +748,37 @@ def _run_encoder_decoder_cross_attention_test(
         cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv
         key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key)
         value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value)
-    return attn.forward(decoder_test_params.packed_qkvo.packed_qkv.query,
-                        key,
-                        value,
-                        kv_cache,
-                        attn_metadata,
-                        attn_type=attn_type)
+    with set_forward_context(attn_metadata):
+        # In the test setup the shape of the query is
+        # [batch_size, seq_len, num_heads, head_size]. However
+        # the attention backend expects the shape to be
+        # [num_tokens, hidden_size]. Hence reshape the query before
+        # invoking the forward method.
+        # TODO - Update the way we construct the query so that it
+        # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
+        reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view(
+            -1, test_pt.num_heads * test_pt.head_size)
+        return attn.forward(reshaped_query,
+                            key,
+                            value,
+                            kv_cache,
+                            attn_metadata,
+                            attn_type=attn_type)
+
+
+@pytest.fixture(autouse=True)
+def set_reset_environment(attn_backend):
+    # Set the default torch datatype to bfloat16 to enable
+    # testing of the Flash Attention backend. Also clear the
+    # cached value of the backend.
+    default_dtype = torch.get_default_dtype()
+    if attn_backend.name == 'FLASH_ATTN':
+        torch.set_default_dtype(torch.bfloat16)
+    get_attn_backend.cache_clear()
+    yield
+    # Reset the torch datatype to what it was before the test
+    # so as not to impact the remaining tests.
+    torch.set_default_dtype(default_dtype)
 
 
 @pytest.mark.skipif(current_platform.is_rocm(),
@@ -773,10 +828,8 @@ def test_encoder_only(
     * max_dec_seq_len: max length of decoder input sequences
     * max_enc_seq_len: max length of encoder input sequences
     '''
-
     # Force Attention wrapper backend
     with global_force_attn_backend_context_manager(attn_backend):
-
         # Note: KV cache size of 4096 is arbitrary & chosen intentionally
         # to be more than necessary, since exceeding the kv cache size
         # is not part of this test
@@ -807,10 +860,14 @@ def test_encoder_only(
         # PREFILL: encoder attention
 
         enc_pckd_act_out: torch.Tensor = (_run_encoder_attention_test(
-            test_rsrcs.attn, enc_test_params, prephase_attn_metadata))
+            test_rsrcs.attn,
+            enc_test_params,
+            prephase_attn_metadata,
+            test_pt=test_pt))
 
         # - Is encoder attention result correct?
-        assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out)
+        assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out,
+                                    attn_backend.name)
 
 
 @pytest.mark.skipif(current_platform.is_rocm(),
@@ -892,10 +949,8 @@ def test_e2e_enc_dec_attn(
     * max_dec_seq_len: max length of decoder input sequences
     * max_enc_seq_len: max length of encoder input sequences
     '''
-
     # Force Attention wrapper backend
     with global_force_attn_backend_context_manager(attn_backend):
-
         # Note: KV cache size of 4096 is arbitrary & chosen intentionally
         # to be more than necessary, since exceeding the kv cache size
         # is not part of this test
@@ -955,29 +1010,39 @@ def test_e2e_enc_dec_attn(
 
         enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn,
                                                        enc_test_params,
-                                                       prephase_attn_metadata)
+                                                       prephase_attn_metadata,
+                                                       test_pt=test_pt)
 
         # - Is encoder attention result correct?
-        assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out)
+        assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out,
+                                    attn_backend.name)
 
         # PREFILL: decoder self-attention test
 
         prephase_dec_pckd_act_out = _run_decoder_self_attention_test(
-            test_rsrcs, prephase_dec_test_params, prephase_attn_metadata)
+            test_rsrcs,
+            prephase_dec_test_params,
+            prephase_attn_metadata,
+            test_pt=test_pt)
 
         # - Is prefill decoder self-attention correct?
         assert_actual_matches_ideal(prephase_dec_test_params,
-                                    prephase_dec_pckd_act_out)
+                                    prephase_dec_pckd_act_out,
+                                    attn_backend.name)
 
         # PREFILL: encoder/decoder cross-attention test
 
         prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test(
-            test_rsrcs, prephase_dec_test_params, prephase_cross_test_params,
-            prephase_attn_metadata)
+            test_rsrcs,
+            prephase_dec_test_params,
+            prephase_cross_test_params,
+            prephase_attn_metadata,
+            test_pt=test_pt)
 
         # - Is prefill encoder/decoder cross-attention correct?
         assert_actual_matches_ideal(prephase_cross_test_params,
-                                    prephase_cross_pckd_act_out)
+                                    prephase_cross_pckd_act_out,
+                                    attn_backend.name)
 
         # DECODE: build decode-phase attention metadata
 
@@ -993,17 +1058,26 @@ def test_e2e_enc_dec_attn(
         # DECODE: decoder self-attention test
 
         decphase_dec_pckd_act_out = _run_decoder_self_attention_test(
-            test_rsrcs, decphase_dec_test_params, decphase_attn_metadata)
+            test_rsrcs,
+            decphase_dec_test_params,
+            decphase_attn_metadata,
+            test_pt=test_pt)
 
         # - Is decode-phase decoder self-attention correct?
         assert_actual_matches_ideal(decphase_dec_test_params,
-                                    decphase_dec_pckd_act_out)
+                                    decphase_dec_pckd_act_out,
+                                    attn_backend.name)
 
         # DECODE: encoder/decoder cross-attention test
 
         decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test(
-            test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata)
+            test_rsrcs,
+            decphase_dec_test_params,
+            None,
+            decphase_attn_metadata,
+            test_pt=test_pt)
 
         # - Is decode-phase encoder/decoder cross-attention correct?
         assert_actual_matches_ideal(decphase_cross_test_params,
-                                    decphase_cross_pckd_act_out)
+                                    decphase_cross_pckd_act_out,
+                                    attn_backend.name)
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index c3d5252edc2a3..e7865fb2500ef 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -13,8 +13,8 @@
 
 from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
-                        make_tensor_with_pad)
+from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
+                        STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
 
 # For now, disable "test_aot_dispatch_dynamic" since there are some
 # bugs related to this test in PyTorch 2.4.
@@ -525,17 +525,22 @@ def make_backend(backend_name: str) -> AttentionBackend:
     if backend_name == STR_XFORMERS_ATTN_VAL:
         # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
         from vllm.attention.backends.xformers import XFormersBackend
-
         return XFormersBackend()
+    elif backend_name == STR_FLASH_ATTN_VAL:
+        from vllm.attention.backends.flash_attn import FlashAttentionBackend
+        return FlashAttentionBackend()
+
     raise AssertionError(
         f"Unrecognized backend_name {backend_name} for unit test")
 
 
 def _make_metadata_tensors(
-    seq_lens: Optional[List[int]], context_lens: Optional[List[int]],
-    encoder_seq_lens: Optional[List[int]], device: Union[torch.device, str]
-) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[List[int]],
-           torch.Tensor, Optional[int]]:
+    seq_lens: Optional[List[int]],
+    context_lens: Optional[List[int]],
+    encoder_seq_lens: Optional[List[int]],
+    device: Union[torch.device, str],
+) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor],
+           torch.Tensor, torch.Tensor, Optional[int]]:
     '''
     Build scalar & tensor values required to build attention metadata structure.
 
@@ -553,6 +558,8 @@ def _make_metadata_tensors(
     * max_context_len: max(context_lens)
     * max_seq_len: max(seq_lens)
     * seq_start_loc: start idx of each sequence
+    * encoder_seq_lens_tensor: encoder seq_lens list, as tensor
+    * encoder_seq_start_loc: start idx of each encoder sequence
     * max_encoder_seq_len: encoder seq_lens list, as tensor
     '''
     seq_lens_tensor = maybe_make_int_tensor(seq_lens, device)
@@ -566,8 +573,26 @@ def _make_metadata_tensors(
 
     seq_start_loc = None
 
+    if seq_lens_tensor is not None:
+        seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
+                                    dtype=torch.int32,
+                                    device=seq_lens_tensor.device)
+        torch.cumsum(seq_lens_tensor,
+                     dim=0,
+                     dtype=seq_start_loc.dtype,
+                     out=seq_start_loc[1:])
+
+    encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + 1,
+                                        dtype=torch.int32,
+                                        device=encoder_seq_lens_tensor.device)
+    torch.cumsum(encoder_seq_lens_tensor,
+                 dim=0,
+                 dtype=encoder_seq_start_loc.dtype,
+                 out=encoder_seq_start_loc[1:])
+
     return (seq_lens_tensor, context_lens_tensor, max_context_len, max_seq_len,
-            seq_start_loc, encoder_seq_lens_tensor, max_encoder_seq_len)
+            seq_start_loc, encoder_seq_lens_tensor, encoder_seq_start_loc,
+            max_encoder_seq_len)
 
 
 def make_kv_cache(num_blocks: int,
@@ -575,6 +600,7 @@ def make_kv_cache(num_blocks: int,
                   head_size: int,
                   block_size: int,
                   device: Union[torch.device, str],
+                  backend: str,
                   default_val: float = 0.0) -> torch.Tensor:
     '''
     Create a fake KV cache.
@@ -591,10 +617,20 @@ def make_kv_cache(num_blocks: int,
     Returns:
 
     * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size)
+    *     for backend 'XFORMERS'
+    * kv_cache: 2 x num_blocks x block_size x num_heads x head_size
+    *     for backend 'FLASH_ATTN'
     '''
-
-    kv_cache = torch.rand(
-        (2, num_blocks, block_size * num_heads * head_size)).to(device)
+    if backend == 'XFORMERS':
+        kv_cache = torch.rand(
+            (2, num_blocks, block_size * num_heads * head_size)).to(device)
+    elif backend == 'FLASH_ATTN':
+        kv_cache = torch.rand(
+            (2, num_blocks, block_size, num_heads, head_size)).to(device)
+    else:
+        raise ValueError(
+            f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or "
+            f"'FLASH_ATTN'.")
     if default_val is not None:
         kv_cache[:, :, :] = default_val
     return kv_cache
@@ -858,8 +894,9 @@ def make_test_metadata(
             context_lens_tensor,
             _,
             _,
-            _,
+            seq_start_loc,
             encoder_seq_lens_tensor,
+            encoder_seq_start_loc,
             max_encoder_seq_len,
         ) = _make_metadata_tensors(seq_lens,
                                    context_lens,
@@ -874,6 +911,7 @@ def make_test_metadata(
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
             seq_lens_tensor=seq_lens_tensor,
+            seq_start_loc=seq_start_loc,
             max_prefill_seq_len=None if seq_lens is None else max(seq_lens),
             max_decode_seq_len=0,
             context_lens_tensor=context_lens_tensor,
@@ -882,6 +920,7 @@ def make_test_metadata(
             num_encoder_tokens=num_encoder_tokens,
             encoder_seq_lens=encoder_seq_lens,
             encoder_seq_lens_tensor=encoder_seq_lens_tensor,
+            encoder_seq_start_loc=encoder_seq_start_loc,
             max_encoder_seq_len=max_encoder_seq_len,
             cross_slot_mapping=(None if cross_kv_mmap is None else
                                 cross_kv_mmap.slot_mapping),
@@ -904,8 +943,9 @@ def make_test_metadata(
             context_lens_tensor,
             _,
             _,
-            _,
+            seq_start_loc,
             encoder_seq_lens_tensor,
+            encoder_seq_start_loc,
             max_encoder_seq_len,
         ) = _make_metadata_tensors(seq_lens,
                                    context_lens,
@@ -920,14 +960,17 @@ def make_test_metadata(
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
             seq_lens_tensor=seq_lens_tensor,
+            seq_start_loc=seq_start_loc,
             max_prefill_seq_len=0,
             max_decode_seq_len=max(seq_lens),
+            max_decode_query_len=1,
             context_lens_tensor=context_lens_tensor,
             block_tables=kv_mmap.block_tables,
             use_cuda_graph=False,
             num_encoder_tokens=num_encoder_tokens,
             encoder_seq_lens=encoder_seq_lens,
             encoder_seq_lens_tensor=encoder_seq_lens_tensor,
+            encoder_seq_start_loc=encoder_seq_start_loc,
             max_encoder_seq_len=max_encoder_seq_len,
             cross_slot_mapping=(None if cross_kv_mmap is None else
                                 cross_kv_mmap.slot_mapping),
@@ -936,7 +979,8 @@ def make_test_metadata(
 
 
 def assert_actual_matches_ideal(test_params: PhaseTestParameters,
-                                output_under_test: torch.Tensor) -> None:
+                                output_under_test: torch.Tensor,
+                                backend: str) -> None:
     '''
     Assert that observed output matches the ideal output
     contained in the test parameters data structure.
@@ -947,8 +991,22 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
     * output_under_test: actually observed output value
     '''
     ideal_output = test_params.packed_qkvo.ideal_output
-    torch.testing.assert_close(ideal_output,
-                               output_under_test.view_as(ideal_output))
+    if backend == 'XFORMERS':
+        torch.testing.assert_close(ideal_output,
+                                   output_under_test.view_as(ideal_output))
+
+    elif backend == 'FLASH_ATTN':
+        # For FlashAttention, override the accuracy thresholds with
+        # non-default values, since we observe a larger difference between
+        # the ideal and actual output.
+        torch.testing.assert_close(ideal_output,
+                                   output_under_test.view_as(ideal_output),
+                                   atol=0.01,
+                                   rtol=0.016)
+    else:
+        raise ValueError(
+            f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or "
+            f"'FLASH_ATTN'.")
 
 
 # Copied/modified from torch._refs.__init__.py
diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py
index 483773f069133..d686f1da3fa17 100644
--- a/tests/models/encoder_decoder/vision_language/test_florence2.py
+++ b/tests/models/encoder_decoder/vision_language/test_florence2.py
@@ -85,7 +85,7 @@ def run_test(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, model, dtype, max_tokens,
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index ab363ac78b028..2975a41797e9f 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -10,10 +10,11 @@
                                               AttentionMetadata,
                                               AttentionMetadataBuilder,
                                               AttentionType)
-from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState,
-                                           compute_slot_mapping,
-                                           compute_slot_mapping_start_idx,
-                                           is_block_tables_empty)
+from vllm.attention.backends.utils import (
+    PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping,
+    compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens,
+    get_seq_len_block_table_args, is_all_cross_attn_metadata_set,
+    is_all_encoder_attn_metadata_set, is_block_tables_empty)
 from vllm.forward_context import get_forward_context
 from vllm.multimodal import MultiModalPlaceholderMap
 from vllm.utils import (async_tensor_h2d, direct_register_custom_op,
@@ -73,7 +74,6 @@ def swap_blocks(
         src_key_cache = src_kv_cache[0]
         dst_key_cache = dst_kv_cache[0]
         ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
-
         src_value_cache = src_kv_cache[1]
         dst_value_cache = dst_kv_cache[1]
         ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst)
@@ -85,6 +85,7 @@ def copy_blocks(
     ) -> None:
         key_caches = [kv_cache[0] for kv_cache in kv_caches]
         value_caches = [kv_cache[1] for kv_cache in kv_caches]
+
         ops.copy_blocks(key_caches, value_caches, src_to_dists)
 
 
@@ -111,26 +112,12 @@ class FlashAttentionMetadata(AttentionMetadata):
     # |-------------------- seq_len ---------------------|
     #                                   |-- query_len ---|
 
-    # Maximum query length in the batch.
-    max_query_len: Optional[int]
-
-    # Max number of query tokens among request in the batch.
-    max_decode_query_len: Optional[int]
-
     # Maximum sequence length among prefill batch. 0 if there are decoding
     # requests only.
     max_prefill_seq_len: int
     # Maximum sequence length among decode batch. 0 if there are prefill
     # requests only.
     max_decode_seq_len: int
-    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
-    # the batch, used to index into subquery. E.g., if the subquery length
-    # is [4, 6], it is [0, 4, 10].
-    query_start_loc: Optional[torch.Tensor]
-    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
-    # the batch, used to index into sequence. E.g., if the sequence length is
-    # [4, 6], it is [0, 4, 10].
-    seq_start_loc: Optional[torch.Tensor]
     # (batch_size,) A tensor of context lengths (tokens that are computed
     # so far).
     context_lens_tensor: Optional[torch.Tensor]
@@ -146,11 +133,62 @@ class FlashAttentionMetadata(AttentionMetadata):
     # Whether or not if cuda graph is enabled.
     # Cuda-graph is currently enabled for decoding only.
     # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
+
     use_cuda_graph: bool
 
+    # Maximum query length in the batch.
+    max_query_len: Optional[int] = None
+
+    # Max number of query tokens among requests in the batch.
+    max_decode_query_len: Optional[int] = None
+
+    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
+    # the batch, used to index into subquery. E.g., if the subquery length
+    # is [4, 6], it is [0, 4, 10].
+    query_start_loc: Optional[torch.Tensor] = None
+    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+    # the batch, used to index into sequence. E.g., if the sequence length is
+    # [4, 6], it is [0, 4, 10].
+    seq_start_loc: Optional[torch.Tensor] = None
+
     _cached_prefill_metadata: Optional["FlashAttentionMetadata"] = None
     _cached_decode_metadata: Optional["FlashAttentionMetadata"] = None
 
+    # Begin encoder attn & enc/dec cross-attn fields...
+
+    # Encoder sequence lengths representation
+    encoder_seq_lens: Optional[List[int]] = None
+    encoder_seq_lens_tensor: Optional[torch.Tensor] = None
+    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+    # the batch, used to index into sequence. E.g., if the sequence length is
+    # [4, 6], it is [0, 4, 10].
+    encoder_seq_start_loc: Optional[torch.Tensor] = None
+    # Maximum sequence length among encoder sequences
+    max_encoder_seq_len: Optional[int] = None
+    # Number of tokens input to encoder
+    num_encoder_tokens: Optional[int] = None
+
+    # Cross-attention memory-mapping data structures: slot mapping
+    # and block tables
+    cross_slot_mapping: Optional[torch.Tensor] = None
+    cross_block_tables: Optional[torch.Tensor] = None
+
+    @property
+    def is_all_encoder_attn_metadata_set(self):
+        '''
+        All attention metadata required for encoder attention is set.
+        '''
+        return is_all_encoder_attn_metadata_set(self)
+
+    @property
+    def is_all_cross_attn_metadata_set(self):
+        '''
+        All attention metadata required for enc/dec cross-attention is set.
+
+        Superset of encoder attention required metadata.
+        '''
+        return is_all_cross_attn_metadata_set(self)
+
     @property
     def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]:
         if self.num_prefills == 0:
@@ -159,32 +197,52 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]:
         if self._cached_prefill_metadata is not None:
             return self._cached_prefill_metadata
 
-        assert self.seq_lens is not None
-        assert self.seq_lens_tensor is not None
-        assert self.query_start_loc is not None
-        assert self.context_lens_tensor is not None
-        assert self.block_tables is not None
-        assert self.seq_start_loc is not None
+        assert ((self.seq_lens is not None)
+                or (self.encoder_seq_lens is not None))
+        assert ((self.seq_lens_tensor is not None)
+                or (self.encoder_seq_lens_tensor is not None))
+
+        # Compute some attn_metadata fields which default to None
+        query_start_loc = (None if self.query_start_loc is None else
+                           self.query_start_loc[:self.num_prefills + 1])
+        slot_mapping = (None if self.slot_mapping is None else
+                        self.slot_mapping[:self.num_prefill_tokens])
+        seq_lens = (None if self.seq_lens is None else
+                    self.seq_lens[:self.num_prefills])
+        seq_lens_tensor = (None if self.seq_lens_tensor is None else
+                           self.seq_lens_tensor[:self.num_prefills])
+        seq_start_loc = (None if self.seq_start_loc is None else
+                         self.seq_start_loc[:self.num_prefills + 1])
+        context_lens_tensor = (None if self.context_lens_tensor is None else
+                               self.context_lens_tensor[:self.num_prefills])
+        block_tables = (None if self.block_tables is None else
+                        self.block_tables[:self.num_prefills])
 
         self._cached_prefill_metadata = FlashAttentionMetadata(
             num_prefills=self.num_prefills,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=0,
-            slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
+            slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=self.
             multi_modal_placeholder_index_maps,
-            seq_lens=self.seq_lens[:self.num_prefills],
-            seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
+            seq_lens=seq_lens,
+            seq_lens_tensor=seq_lens_tensor,
             max_query_len=self.max_query_len,
             max_prefill_seq_len=self.max_prefill_seq_len,
             max_decode_query_len=0,
             max_decode_seq_len=0,
-            query_start_loc=self.query_start_loc[:self.num_prefills + 1],
-            seq_start_loc=self.seq_start_loc[:self.num_prefills + 1],
-            context_lens_tensor=self.context_lens_tensor[:self.num_prefills],
-            block_tables=self.block_tables[:self.num_prefills],
+            query_start_loc=query_start_loc,
+            seq_start_loc=seq_start_loc,
+            context_lens_tensor=context_lens_tensor,
+            block_tables=block_tables,
             use_cuda_graph=False,
-        )
+            # Begin encoder & cross attn fields below...
+            encoder_seq_lens=self.encoder_seq_lens,
+            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
+            encoder_seq_start_loc=self.encoder_seq_start_loc,
+            max_encoder_seq_len=self.max_encoder_seq_len,
+            cross_slot_mapping=self.cross_slot_mapping,
+            cross_block_tables=self.cross_block_tables)
         return self._cached_prefill_metadata
 
     @property
@@ -194,17 +252,25 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]:
 
         if self._cached_decode_metadata is not None:
             return self._cached_decode_metadata
-        assert self.block_tables is not None
-        assert self.seq_lens_tensor is not None
+        assert ((self.seq_lens_tensor is not None)
+                or (self.encoder_seq_lens_tensor is not None))
+
+        # Compute some attn_metadata fields which default to None
+        slot_mapping = (None if self.slot_mapping is None else
+                        self.slot_mapping[self.num_prefill_tokens:])
+        seq_lens_tensor = (None if self.seq_lens_tensor is None else
+                           self.seq_lens_tensor[self.num_prefills:])
+        block_tables = (None if self.block_tables is None else
+                        self.block_tables[self.num_prefills:])
 
         self._cached_decode_metadata = FlashAttentionMetadata(
             num_prefills=0,
             num_prefill_tokens=0,
             num_decode_tokens=self.num_decode_tokens,
-            slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
+            slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=None,
             seq_lens=None,
-            seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
+            seq_lens_tensor=seq_lens_tensor,
             max_decode_query_len=self.max_decode_query_len,
             max_query_len=self.max_query_len,
             max_prefill_seq_len=0,
@@ -214,9 +280,15 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]:
             seq_start_loc=self.seq_start_loc[self.num_prefills:]
             if self.seq_start_loc is not None else None,
             context_lens_tensor=None,
-            block_tables=self.block_tables[self.num_prefills:],
+            block_tables=block_tables,
             use_cuda_graph=self.use_cuda_graph,
-        )
+            # Begin encoder & cross attn fields below...
+            encoder_seq_lens=self.encoder_seq_lens,
+            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
+            encoder_seq_start_loc=self.encoder_seq_start_loc,
+            max_encoder_seq_len=self.max_encoder_seq_len,
+            cross_slot_mapping=self.cross_slot_mapping,
+            cross_block_tables=self.cross_block_tables)
         return self._cached_decode_metadata
 
     def advance_step(self,
@@ -586,16 +658,20 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashAttentionImpl")
-
         # NOTE(woosuk): FlashAttention does not support FP8 KV cache.
         assert k_scale == 1.0 and v_scale == 1.0, (
             "key/v_scale is not supported in FlashAttention.")
 
+        if (attn_type == AttentionType.ENCODER
+                and (not attn_metadata.is_all_encoder_attn_metadata_set)):
+            raise AttributeError("Encoder attention requires setting "
+                                 "encoder metadata attributes.")
+        elif (attn_type == AttentionType.ENCODER_DECODER
+              and (not attn_metadata.is_all_cross_attn_metadata_set)):
+            raise AttributeError("Encoder/decoder cross-attention "
+                                 "requires setting cross-attention "
+                                 "metadata attributes.")
+
         output = torch.ops.vllm.unified_flash_attention(
             query,
             key,
@@ -608,6 +684,7 @@ def forward(
             k_scale,
             v_scale,
             self.scale,
+            attn_type.value,
             self.sliding_window,
             self.alibi_slopes,
             self.logits_soft_cap,
@@ -616,6 +693,89 @@ def forward(
         return output
 
 
+def _get_query_key_seq_metadata(
+    attn_metadata,
+    is_prompt: bool,
+    attn_type: AttentionType,
+) -> tuple:
+    """
+    Returns sequence metadata for key and query based on the specified 
+    attention type and whether input is a prompt.
+
+    This function computes the starting locations and maximum sequence lengths 
+    for key and query sequences for different attention types.
+
+    Args:
+        attn_metadata: The attention metadata object
+        is_prompt (bool): A flag indicating if the input is a prompt
+        attn_type (AttentionType): The type of attention being used.
+
+    Returns:
+        tuple: A tuple containing four values:
+            - Starting locations (tensor) for the query sequences.
+            - Maximum sequence length for the query sequence.
+            - Starting locations (tensor) for the key sequences.
+            - Maximum sequence length for the key sequence.
+
+    Raises:
+        AttributeError: If an invalid attention type is provided.
+    """
+    if attn_type == AttentionType.DECODER:
+        # Decoder self-attention
+        # Choose max_seq_len based on whether we are in prompt_run
+        if is_prompt:
+            max_seq_len = attn_metadata.max_prefill_seq_len
+        else:
+            max_seq_len = attn_metadata.max_decode_seq_len
+        return (attn_metadata.seq_start_loc, max_seq_len,
+                attn_metadata.seq_start_loc, max_seq_len)
+
+    elif attn_type == AttentionType.ENCODER_DECODER:
+        # This is cross-attention, where the key is the
+        # precomputed encoder attention and the query
+        # is the input sequence.
+        # Choose query max length based on whether it is prompt
+        # or not.
+        if is_prompt:
+            max_seq_len = attn_metadata.max_prefill_seq_len
+        else:
+            max_seq_len = attn_metadata.max_decode_seq_len
+        return (attn_metadata.seq_start_loc, max_seq_len,
+                attn_metadata.encoder_seq_start_loc,
+                attn_metadata.max_encoder_seq_len)
+    elif attn_type == AttentionType.ENCODER:
+        # For encoder attention, both the query and the key are the same,
+        # i.e. the encoder sequence.
+        return (attn_metadata.encoder_seq_start_loc,
+                attn_metadata.max_encoder_seq_len,
+                attn_metadata.encoder_seq_start_loc,
+                attn_metadata.max_encoder_seq_len)
+    elif attn_type == AttentionType.ENCODER_ONLY:
+        assert is_prompt, "Should not have decode for encoder only model."
+        return (attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len,
+                attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len)
+    else:
+        raise AttributeError(f"Invalid attention type {str(attn_type)}")
+
+
+def _get_causal_option(attn_type: AttentionType) -> bool:
+    """
+    Determine whether the given attention type is suitable for causal 
+    attention mechanisms.
+
+    Args:
+        attn_type (AttentionType): The type of attention being evaluated
+
+    Returns:
+        bool: Returns `True` if the attention type is suitable for causal 
+        attention (i.e., not encoder, encoder-only, or encoder-decoder), 
+        otherwise returns `False`.
+    """
+    return not (attn_type == AttentionType.ENCODER
+                or attn_type == AttentionType.ENCODER_ONLY
+                or attn_type == AttentionType.ENCODER_DECODER)
+
+
 def unified_flash_attention(
     query: torch.Tensor,
     key: torch.Tensor,
@@ -628,60 +788,76 @@ def unified_flash_attention(
     k_scale: float,
     v_scale: float,
     softmax_scale: float,
+    attn_type_int_val: int,
     window_size: Optional[List[int]] = None,
     alibi_slopes: Optional[torch.Tensor] = None,
     logits_soft_cap: Optional[float] = None,
 ) -> torch.Tensor:
 
+    # Convert integer attn_type to enum
+    try:
+        attn_type = AttentionType(attn_type_int_val)
+    except ValueError as err:
+        raise AttributeError(
+            f"Invalid attention type {str(attn_type_int_val)}") from err
+
     current_metadata = get_forward_context()
     assert current_metadata is not None
     assert isinstance(current_metadata, FlashAttentionMetadata)
     attn_metadata: FlashAttentionMetadata = current_metadata
 
     num_tokens, hidden_size = query.shape
+
     # Reshape the query, key, and value tensors.
     query = query.view(-1, num_heads, head_size)
-    key = key.view(-1, num_kv_heads, head_size)
-    value = value.view(-1, num_kv_heads, head_size)
+    if (key is not None) and (value is not None):
+        key = key.view(-1, num_kv_heads, head_size)
+        value = value.view(-1, num_kv_heads, head_size)
 
     if kv_cache.numel() > 0:
         key_cache = kv_cache[0]
         value_cache = kv_cache[1]
+        # We skip updating the KV cache under two conditions:
+        #  a. When the Attention Type is ENCODER. In this phase, we compute
+        #     only the encoder attention without updating the cache.
+        #  b. When both Key and Value are None. This occurs during
+        #     cross-attention computation in the decoding phase, where the KV
+        #     cache is already populated with the cross-attention tensor.
+        #     Thus, we skip cache updates during this time.
+        if (attn_type != AttentionType.ENCODER) and (key is not None) and (
+                value is not None):
+            if attn_type == AttentionType.ENCODER_DECODER:
+                # Update cross-attention KV cache (prefill-only)
+                updated_slot_mapping = attn_metadata.cross_slot_mapping
+            else:
+                # Update self-attention KV cache (prefill/decode)
+                updated_slot_mapping = attn_metadata.slot_mapping
+
+            # Reshape the input keys and values and store them in the cache.
+            # If kv_cache is not provided, the new key and value tensors are
+            # not cached. This happens during the initial memory profiling run.
+            torch.ops._C_cache_ops.reshape_and_cache_flash(
+                key,
+                value,
+                kv_cache[0],
+                kv_cache[1],
+                updated_slot_mapping.flatten(),  # type: ignore[union-attr]
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+            )
 
-        # Reshape the input keys and values and store them in the cache.
-        # If kv_cache is not provided, the new key and value tensors are
-        # not cached. This happens during the initial memory profiling run.
-        torch.ops._C_cache_ops.reshape_and_cache_flash(
-            key,
-            value,
-            kv_cache[0],
-            kv_cache[1],
-            attn_metadata.slot_mapping.flatten(),
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-        )
-
-    num_prefill_tokens = attn_metadata.num_prefill_tokens
-    num_decode_tokens = attn_metadata.num_decode_tokens
-    assert key.shape[0] == num_prefill_tokens + num_decode_tokens, \
-                f"key : {key.shape} : #prefill tokens {num_prefill_tokens} : #decode tokens {num_decode_tokens}" # noqa
-    assert value.shape[0] == num_prefill_tokens + num_decode_tokens, \
-                f"value : {value.shape} : #prefill toks {num_prefill_tokens} : #decode toks {num_decode_tokens}" # noqa
-
-    # Query for decode. KV is not needed because it is already cached.
-    decode_query = query[num_prefill_tokens:]
+    (num_prefill_query_tokens, num_prefill_kv_tokens,
+    num_decode_query_tokens) = \
+        get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type)
+    decode_query = query[num_prefill_query_tokens:]
     # QKV for prefill.
-    query = query[:num_prefill_tokens]
-    key = key[:num_prefill_tokens]
-    value = value[:num_prefill_tokens]
-
-    assert query.shape[0] == num_prefill_tokens
-    assert decode_query.shape[0] == num_decode_tokens
+    query = query[:num_prefill_query_tokens]
+    assert query.shape[0] == num_prefill_query_tokens
+    assert decode_query.shape[0] == num_decode_query_tokens
 
     prefill_output: Optional[torch.Tensor] = None
     decode_output: Optional[torch.Tensor] = None
-
     if prefill_meta := attn_metadata.prefill_metadata:
         # Prompt run.
         if (kv_cache.numel() == 0 or prefill_meta.block_tables is None
@@ -689,22 +865,30 @@ def unified_flash_attention(
             # normal attention
             # When block_tables are not filled, it means q and k are the
             # prompt, and they have the same length.
+            q_seq_start_loc, q_seq_len, k_seq_start_loc, k_seq_len = \
+                _get_query_key_seq_metadata(prefill_meta, True, attn_type)
+
+            key = key[:num_prefill_kv_tokens]
+            value = value[:num_prefill_kv_tokens]
+
             prefill_output = flash_attn_varlen_func(
                 q=query,
                 k=key,
                 v=value,
-                cu_seqlens_q=prefill_meta.seq_start_loc,
-                cu_seqlens_k=prefill_meta.seq_start_loc,
-                max_seqlen_q=prefill_meta.max_prefill_seq_len,
-                max_seqlen_k=prefill_meta.max_prefill_seq_len,
+                cu_seqlens_q=q_seq_start_loc,
+                cu_seqlens_k=k_seq_start_loc,
+                max_seqlen_q=q_seq_len,
+                max_seqlen_k=k_seq_len,
                 softmax_scale=softmax_scale,
-                causal=True,
+                causal=_get_causal_option(attn_type),
                 window_size=window_size,
                 alibi_slopes=alibi_slopes,
                 softcap=logits_soft_cap,
             )
         else:
             # prefix-enabled attention
+            assert attn_type == AttentionType.DECODER, (
+                "Only decoder-only models support prefix caching")
             assert prefill_meta.seq_lens is not None
             max_seq_len = max(prefill_meta.seq_lens)
             prefill_output = flash_attn_varlen_func(  # noqa
@@ -729,6 +913,8 @@ def unified_flash_attention(
         # because different queries might have different lengths.
         assert decode_meta.max_decode_query_len is not None
         if decode_meta.max_decode_query_len > 1:
+            assert attn_type == AttentionType.DECODER, (
+                "Only decoder-only models support max_decode_query_len > 1")
             decode_output = flash_attn_varlen_func(
                 q=decode_query,
                 k=key_cache,
@@ -746,12 +932,17 @@ def unified_flash_attention(
             )
         else:
             # Use flash_attn_with_kvcache for normal decoding.
+            (
+                seq_lens_arg,
+                _,
+                block_tables_arg,
+            ) = get_seq_len_block_table_args(decode_meta, False, attn_type)
             decode_output = flash_attn_with_kvcache(
                 q=decode_query.unsqueeze(1),
                 k_cache=key_cache,
                 v_cache=value_cache,
-                block_table=decode_meta.block_tables,
-                cache_seqlens=decode_meta.seq_lens_tensor,
+                block_table=block_tables_arg,
+                cache_seqlens=seq_lens_arg,
                 softmax_scale=softmax_scale,
                 causal=True,
                 window_size=window_size,
@@ -761,10 +952,10 @@ def unified_flash_attention(
 
     if prefill_output is None:
         assert decode_output is not None
-        return decode_output.view(num_decode_tokens, hidden_size)
+        return decode_output.view(num_decode_query_tokens, hidden_size)
     if decode_output is None:
         assert prefill_output is not None
-        return prefill_output.view(num_prefill_tokens, hidden_size)
+        return prefill_output.view(num_prefill_query_tokens, hidden_size)
 
     # Chunked prefill does not work with speculative decoding.
     # Therefore, the query length for decode should be 1 in chunked prefill.
@@ -786,6 +977,7 @@ def unified_flash_attention_fake(
     k_scale: float,
     v_scale: float,
     softmax_scale: float,
+    attn_type_int_val: int,
     window_size: Optional[List[int]] = None,
     alibi_slopes: Optional[torch.Tensor] = None,
     logits_soft_cap: Optional[float] = None,
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 55293bbb06e1d..096c920c4833a 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -1,13 +1,14 @@
 """Attention backend utils"""
 from collections import defaultdict
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, Dict, List, Type, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union
 
 import numpy as np
 import torch
 
 from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder,
                             AttentionState)
+from vllm.attention.backends.abstract import AttentionType
 from vllm.multimodal import MultiModalPlaceholderMap
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 
@@ -336,11 +337,13 @@ def graph_capture_get_metadata_for_batch(
             use_cuda_graph=True,
         )
         if is_encoder_decoder_model:
-            # The encoder decoder model works only with XFormers backend.
-            # Assert the same.
-            assert self.runner.attn_backend.get_name() == "XFORMERS", \
-            f"Expected attn_backend name to be 'XFORMERS', but "\
-            f" got '{self.runner.attn_backend.get_name()}'"
+            # The encoder-decoder model works only with the XFormers and
+            # Flash Attention backends. Assert that one of them is in use.
+            assert self.runner.attn_backend.get_name() in\
+                ["XFORMERS", "FLASH_ATTN"], \
+                f"Expected attn_backend name to be either 'XFORMERS' or " \
+                f"'FLASH_ATTN', but "\
+                f"got '{self.runner.attn_backend.get_name()}'"
             self._update_captured_metadata_for_enc_dec_model(
                 batch_size=batch_size, attn_metadata=attn_metadata)
 
@@ -356,11 +359,13 @@ def get_graph_input_buffers(
             "block_tables": attn_metadata.decode_metadata.block_tables,
         }
         if is_encoder_decoder_model:
-            # The encoder decoder model works only with XFormers backend.
-            # Assert the same.
-            assert self.runner.attn_backend.get_name() == "XFORMERS", \
-            f"Expected attn_backend name to be 'XFORMERS', but "\
-            f" got '{self.runner.attn_backend.get_name()}'"
+            # The encoder-decoder model works only with the XFormers and
+            # Flash Attention backends. Assert that one of them is in use.
+            assert self.runner.attn_backend.get_name() in\
+                ["XFORMERS", "FLASH_ATTN"], \
+                f"Expected attn_backend name to be either 'XFORMERS' or "\
+                f"'FLASH_ATTN', but "\
+                f"got '{self.runner.attn_backend.get_name()}'"
             self._add_additonal_input_buffers_for_enc_dec_model(
                 attn_metadata=attn_metadata, input_buffers=input_buffers)
         return input_buffers
@@ -375,11 +380,13 @@ def prepare_graph_input_buffers(
         input_buffers["block_tables"].copy_(
             attn_metadata.decode_metadata.block_tables, non_blocking=True)
         if is_encoder_decoder_model:
-            # The encoder decoder model works only with XFormers backend.
-            # Assert the same.
-            assert self.runner.attn_backend.get_name() == "XFORMERS", \
-            f"Expected attn_backend name to be 'XFORMERS', but "\
-            f" got '{self.runner.attn_backend.get_name()}'"
+            # The encoder-decoder model works only with the XFormers and
+            # Flash Attention backends. Assert that one of them is in use.
+            assert self.runner.attn_backend.get_name() in\
+                ["XFORMERS", "FLASH_ATTN"], \
+                f"Expected attn_backend name to be either 'XFORMERS' or "\
+                f"'FLASH_ATTN', but "\
+                f"got '{self.runner.attn_backend.get_name()}'"
             self._prepare_input_buffers_for_enc_dec_model(
                 attn_metadata, input_buffers)
 
@@ -411,6 +418,7 @@ def _update_captured_metadata_for_enc_dec_model(self, batch_size: int,
         attn_metadata.encoder_seq_lens_tensor = torch.full(
             (batch_size, ), 1, dtype=torch.int).cuda()
         attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture
+        attn_metadata.num_encoder_tokens = 0
 
     def _add_additonal_input_buffers_for_enc_dec_model(
             self, attn_metadata, input_buffers: Dict[str, Any]):
@@ -453,3 +461,122 @@ def _prepare_input_buffers_for_enc_dec_model(self, attn_metadata,
         input_buffers["cross_block_tables"].copy_(
             attn_metadata.decode_metadata.cross_block_tables,
             non_blocking=True)
+
+
+def is_all_encoder_attn_metadata_set(attn_metadata):
+    '''
+    All attention metadata required for encoder attention is set.
+    '''
+    return ((attn_metadata.encoder_seq_lens is not None)
+            and (attn_metadata.encoder_seq_lens_tensor is not None)
+            and (attn_metadata.max_encoder_seq_len is not None))
+
+
+def is_all_cross_attn_metadata_set(attn_metadata):
+    '''
+    All attention metadata required for enc/dec cross-attention is set.
+
+    Superset of encoder attention required metadata.
+    '''
+    return (attn_metadata.is_all_encoder_attn_metadata_set
+            and (attn_metadata.cross_slot_mapping is not None)
+            and (attn_metadata.cross_block_tables is not None))
+
+
+def get_seq_len_block_table_args(
+    attn_metadata,
+    is_prompt: bool,
+    attn_type: AttentionType,
+) -> tuple:
+    '''
+    The particular choice of sequence-length- and block-table-related
+    attributes which should be extracted from attn_metadata is dependent
+    on the type of attention operation.
+
+    Decoder attn -> select entirely decoder self-attention-related fields
+    Encoder/decoder cross-attn -> select encoder sequence lengths & 
+                                  cross-attn block-tables fields
+    Encoder attn -> select encoder sequence lengths fields & no block tables
+    
+    Arguments:
+
+    * attn_metadata: Attention metadata structure associated with attention op
+    * is_prompt: True if prefill, False otherwise
+    * attn_type: encoder attention, decoder self-attention,
+                 encoder/decoder cross-attention
+
+    Returns:
+
+    * Appropriate sequence-lengths tensor
+    * Appropriate max sequence-length scalar
+    * Appropriate block tables (or None)
+    '''
+
+    if attn_type == AttentionType.DECODER:
+        # Decoder self-attention
+        # Choose max_seq_len based on whether we are in prompt_run
+        if is_prompt:
+            max_seq_len = attn_metadata.max_prefill_seq_len
+        else:
+            max_seq_len = attn_metadata.max_decode_seq_len
+        return (attn_metadata.seq_lens_tensor, max_seq_len,
+                attn_metadata.block_tables)
+    elif attn_type == AttentionType.ENCODER_DECODER:
+        # Enc/dec cross-attention KVs match encoder sequence length;
+        # cross-attention utilizes special "cross" block tables
+        return (attn_metadata.encoder_seq_lens_tensor,
+                attn_metadata.max_encoder_seq_len,
+                attn_metadata.cross_block_tables)
+    elif attn_type == AttentionType.ENCODER:
+        # No block tables associated with encoder attention
+        return (attn_metadata.encoder_seq_lens_tensor,
+                attn_metadata.max_encoder_seq_len, None)
+    else:
+        raise AttributeError(f"Invalid attention type {str(attn_type)}")
+
+
+def get_num_prefill_decode_query_kv_tokens(
+    attn_metadata,
+    attn_type: AttentionType,
+) -> Tuple[int, int, int]:
+    """
+    Calculate the number of prefill and decode tokens for query, key/value
+    based on the attention metadata and the specified attention type.
+
+    Args:
+        attn_metadata (FlashAttentionMetadata): Attention Metadata object.
+        attn_type (AttentionType): The type of attention being used.
+    Returns:
+        Tuple[int, int, int]: A tuple containing three integers:
+            - The number of prefill query tokens.
+            - The number of prefill key/value tokens.
+            - The number of decode query tokens.
+
+    Raises:
+        AssertionError: If the number of encoder tokens in `attn_metadata` 
+        is `None` when required for the calculations.
+    """
+    num_prefill_query_tokens = 0
+    num_decode_query_tokens = 0
+    num_prefill_kv_tokens = 0
+    if attn_type == AttentionType.ENCODER:
+        # Encoder attention is only invoked during prefill phase.
+        # The same input serves as both query and key.
+        assert attn_metadata.num_encoder_tokens is not None
+        num_prefill_query_tokens = attn_metadata.num_encoder_tokens
+        num_prefill_kv_tokens = attn_metadata.num_encoder_tokens
+        num_decode_query_tokens = 0
+    elif attn_type == AttentionType.ENCODER_DECODER:
+        assert attn_metadata.num_encoder_tokens is not None
+        num_prefill_query_tokens = attn_metadata.num_prefill_tokens
+        # The key is the encoder/cross-attention.
+        num_prefill_kv_tokens = attn_metadata.num_encoder_tokens
+        num_decode_query_tokens = attn_metadata.num_decode_tokens
+    else:  # attn_type == AttentionType.DECODER or
+        # attn_type == AttentionType.ENCODER_ONLY
+        num_prefill_query_tokens = attn_metadata.num_prefill_tokens
+        num_prefill_kv_tokens = attn_metadata.num_prefill_tokens
+        num_decode_query_tokens = attn_metadata.num_decode_tokens
+
+    return (num_prefill_query_tokens, num_prefill_kv_tokens,
+            num_decode_query_tokens)
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 21877f2dded0e..4725413baade7 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -11,8 +11,10 @@
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
-from vllm.attention.backends.utils import (CommonAttentionState,
-                                           CommonMetadataBuilder)
+from vllm.attention.backends.utils import (
+    CommonAttentionState, CommonMetadataBuilder,
+    get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args,
+    is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set)
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
 from vllm.logger import init_logger
@@ -135,6 +137,11 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata):
     # Encoder sequence lengths representation
     encoder_seq_lens: Optional[List[int]] = None
     encoder_seq_lens_tensor: Optional[torch.Tensor] = None
+    # FIXME: This field is only needed by the flash-attn backend.
+    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+    # the batch, used to index into sequence. E.g., if the sequence length is
+    # [4, 6], it is [0, 4, 10].
+    encoder_seq_start_loc: Optional[torch.Tensor] = None
 
     # Maximum sequence length among encoder sequences
     max_encoder_seq_len: Optional[int] = None
@@ -162,9 +169,7 @@ def is_all_encoder_attn_metadata_set(self):
         '''
         All attention metadata required for encoder attention is set.
         '''
-        return ((self.encoder_seq_lens is not None)
-                and (self.encoder_seq_lens_tensor is not None)
-                and (self.max_encoder_seq_len is not None))
+        return is_all_encoder_attn_metadata_set(self)
 
     @property
     def is_all_cross_attn_metadata_set(self):
@@ -173,9 +178,7 @@ def is_all_cross_attn_metadata_set(self):
 
         Superset of encoder attention required metadata.
         '''
-        return (self.is_all_encoder_attn_metadata_set
-                and (self.cross_slot_mapping is not None)
-                and (self.cross_block_tables is not None))
+        return is_all_cross_attn_metadata_set(self)
 
     @property
     def prefill_metadata(self) -> Optional["XFormersMetadata"]:
@@ -329,64 +332,6 @@ def _set_attn_bias(
         raise AttributeError(f"Invalid attention type {str(attn_type)}")
 
 
-def _get_seq_len_block_table_args(
-    attn_metadata: XFormersMetadata,
-    is_prompt: bool,
-    attn_type: AttentionType,
-) -> tuple:
-    '''
-    The particular choice of sequence-length- and block-table-related
-    attributes which should be extracted from attn_metadata is dependent
-    on the type of attention operation.
-
-    Decoder attn -> select entirely decoder self-attention-related fields
-    Encoder/decoder cross-attn -> select encoder sequence lengths & 
-                                  cross-attn block-tables fields
-    Encoder attn -> select encoder sequence lengths fields & no block tables
-    
-    Arguments:
-
-    * attn_metadata: Attention metadata structure associated with attention op
-    * is_prompt: True if prefill, False otherwise
-    * attn_type: encoder attention, decoder self-attention,
-                 encoder/decoder cross-attention
-
-    Returns:
-
-    * Appropriate sequence-lengths tensor
-    * Appropriate max sequence-length scalar
-    * Appropriate block tables (or None)
-    '''
-
-    if attn_type == AttentionType.DECODER:
-        # Decoder self-attention
-        # Choose max_seq_len based on whether we are in prompt_run
-        if is_prompt:
-            max_seq_len = attn_metadata.max_prefill_seq_len
-        else:
-            max_seq_len = attn_metadata.max_decode_seq_len
-        return (attn_metadata.seq_lens_tensor, max_seq_len,
-                attn_metadata.block_tables)
-    elif attn_type == AttentionType.ENCODER_DECODER:
-        # Enc/dec cross-attention KVs match encoder sequence length;
-        # cross-attention utilizes special "cross" block tables
-        return (attn_metadata.encoder_seq_lens_tensor,
-                attn_metadata.max_encoder_seq_len,
-                attn_metadata.cross_block_tables)
-    elif attn_type == AttentionType.ENCODER:
-        # No block tables associated with encoder attention
-        return (attn_metadata.encoder_seq_lens_tensor,
-                attn_metadata.max_encoder_seq_len, None)
-    elif attn_type == AttentionType.ENCODER_ONLY:
-        assert is_prompt, "Should not have decode for encoder only model."
-
-        # No block tables associated with encoder attention
-        return (attn_metadata.seq_lens_tensor,
-                attn_metadata.max_prefill_seq_len, None)
-    else:
-        raise AttributeError(f"Invalid attention type {str(attn_type)}")
-
-
 class XFormersMetadataBuilder(CommonMetadataBuilder[XFormersMetadata]):
 
     _metadata_cls = XFormersMetadata
@@ -574,45 +519,21 @@ def forward(
                                                     updated_slot_mapping,
                                                     self.kv_cache_dtype,
                                                     k_scale, v_scale)
-
-        if attn_type == AttentionType.ENCODER:
-            # Encoder attention - chunked prefill is not applicable;
-            # derive token-count from query shape & and treat them
-            # as 100% prefill tokens
-            assert attn_metadata.num_encoder_tokens is not None
-            num_prefill_tokens = attn_metadata.num_encoder_tokens
-            num_encoder_tokens = attn_metadata.num_encoder_tokens
-            num_decode_tokens = 0
-        elif attn_type == AttentionType.DECODER:
-            # Decoder self-attention supports chunked prefill.
-            num_prefill_tokens = attn_metadata.num_prefill_tokens
-            num_encoder_tokens = attn_metadata.num_prefill_tokens
-            num_decode_tokens = attn_metadata.num_decode_tokens
-            # Only enforce this shape-constraint for decoder
-            # self-attention
-            assert key.shape[0] == num_prefill_tokens + num_decode_tokens
-            assert value.shape[0] == num_prefill_tokens + num_decode_tokens
-        else:  # attn_type == AttentionType.ENCODER_DECODER
-            # Encoder/decoder cross-attention requires no chunked
-            # prefill (100% prefill or 100% decode tokens, no mix)
-            num_prefill_tokens = attn_metadata.num_prefill_tokens
-            if attn_metadata.num_encoder_tokens is not None:
-                num_encoder_tokens = attn_metadata.num_encoder_tokens
-            else:
-                num_encoder_tokens = attn_metadata.num_prefill_tokens
-            num_decode_tokens = attn_metadata.num_decode_tokens
+        (num_prefill_query_tokens, num_prefill_kv_tokens,
+        num_decode_query_tokens) = \
+            get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type)
 
         output = torch.empty_like(query)
         # Query for decode. KV is not needed because it is already cached.
-        decode_query = query[num_prefill_tokens:]
+        decode_query = query[num_prefill_query_tokens:]
         # QKV for prefill.
-        query = query[:num_prefill_tokens]
+        query = query[:num_prefill_query_tokens]
         if key is not None and value is not None:
-            key = key[:num_encoder_tokens]
-            value = value[:num_encoder_tokens]
+            key = key[:num_prefill_kv_tokens]
+            value = value[:num_prefill_kv_tokens]
 
-        assert query.shape[0] == num_prefill_tokens
-        assert decode_query.shape[0] == num_decode_tokens
+        assert query.shape[0] == num_prefill_query_tokens
+        assert decode_query.shape[0] == num_decode_query_tokens
 
         if prefill_meta := attn_metadata.prefill_metadata:
             # Prompt run.
@@ -622,8 +543,8 @@ def forward(
                 # prefix.
                 out = self._run_memory_efficient_xformers_forward(
                     query, key, value, prefill_meta, attn_type=attn_type)
-                assert out.shape == output[:num_prefill_tokens].shape
-                output[:num_prefill_tokens] = out
+                assert out.shape == output[:num_prefill_query_tokens].shape
+                output[:num_prefill_query_tokens] = out
             else:
                 assert attn_type != AttentionType.ENCODER_ONLY, (
                     "Encoder-only models should not have prefix attention.")
@@ -652,8 +573,8 @@ def forward(
                     k_scale,
                     v_scale,
                 )
-                assert output[:num_prefill_tokens].shape == out.shape
-                output[:num_prefill_tokens] = out
+                assert output[:num_prefill_query_tokens].shape == out.shape
+                output[:num_prefill_query_tokens] = out
 
         if decode_meta := attn_metadata.decode_metadata:
             assert attn_type != AttentionType.ENCODER_ONLY, (
@@ -663,9 +584,9 @@ def forward(
                 seq_lens_arg,
                 max_seq_len_arg,
                 block_tables_arg,
-            ) = _get_seq_len_block_table_args(decode_meta, False, attn_type)
+            ) = get_seq_len_block_table_args(decode_meta, False, attn_type)
 
-            output[num_prefill_tokens:] = PagedAttention.forward_decode(
+            output[num_prefill_query_tokens:] = PagedAttention.forward_decode(
                 decode_query,
                 key_cache,
                 value_cache,
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 376b3136f0fb8..8a59cf41a689e 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -98,7 +98,6 @@ def get_attn_backend(
     is_blocksparse: bool = False,
 ) -> Type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
-
     if is_blocksparse:
         logger.info("Using BlocksparseFlashAttention backend.")
         from vllm.attention.backends.blocksparse_attn import (
@@ -108,6 +107,7 @@ def get_attn_backend(
     backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size,
                                 is_attention_free)
     if backend == _Backend.FLASH_ATTN:
+        logger.info("Using Flash Attention backend.")
         from vllm.attention.backends.flash_attn import (  # noqa: F401
             FlashAttentionBackend)
         return FlashAttentionBackend
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index cbdacf779b089..0543ca978b7dd 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -624,8 +624,6 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
             Decoder output torch.Tensor
         """
         # retrieve input_ids and inputs_embeds
-
-        input_ids = input_ids.view(-1, input_ids.shape[-1])
         inputs_embeds = self.embed_tokens(input_ids)
 
         embed_pos = self.embed_positions(
diff --git a/vllm/utils.py b/vllm/utils.py
index 5488719cc99b0..1041120a24b3f 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -80,8 +80,8 @@
                                  "currently supported with encoder/"
                                  "decoder models.")
 
-STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers is the only backend "
-                                "currently supported with encoder/"
+STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only "
+                                "backends currently supported with encoder/"
                                 "decoder models.")
 
 STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER = ("Prompt adapters are not "
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index a4b665d71f28a..2ea314f8608ee 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -19,6 +19,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.model_executor.model_loader.utils import get_architecture_class_name
 from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs,
                              MultiModalRegistry)
 from vllm.sampling_params import SamplingParams
@@ -36,6 +37,11 @@
 
 logger = init_logger(__name__)
 
+# The Mllama model has PagedAttention-specific logic, so it can only be
+# run with the XFORMERS backend.
+# TODO: Make the Mllama model work with the Flash Attention backend.
+_XFORMERS_ONLY_ENCODER_DECODER_ARCHS = ["MllamaForConditionalGeneration"]
+
 
 @dataclasses.dataclass(frozen=True)
 class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata):
@@ -101,9 +107,7 @@ def __init__(
         models) but these arguments are present here for compatibility with 
         the base-class constructor.
         '''
-
-        self._maybe_force_supported_attention_backend()
-
+        self._maybe_force_supported_attention_backend(model_config)
         super().__init__(
             model_config,
             parallel_config,
@@ -119,7 +123,12 @@ def __init__(
         # Crash for unsupported encoder/scenarios
         assert_enc_dec_mr_supported_scenario(self)
 
-    def _maybe_force_supported_attention_backend(self):
+    def _is_xformers_only_encoder_decoder_model(self,
+                                                model: ModelConfig) -> bool:
+        return get_architecture_class_name(
+            model) in _XFORMERS_ONLY_ENCODER_DECODER_ARCHS
+
+    def _maybe_force_supported_attention_backend(self, model: ModelConfig):
         '''
         Force vLLM to use the XFormers attention backend,
         which is currently the only supported option.
@@ -135,22 +144,26 @@ def raise_backend_err():
         is_forced_by_global = maybe_global_forced_backend is not None
         is_forced_by_env_var = maybe_env_var_forced_backend is not None
 
-        if not (is_forced_by_global or is_forced_by_env_var):
+        if not (is_forced_by_global or is_forced_by_env_var) \
+            and self._is_xformers_only_encoder_decoder_model(model):
             # The user has not already specified an attention backend
             # override
-            logger.info("EncoderDecoderModelRunner requires "
-                        "XFormers backend; overriding backend "
-                        "auto-selection and forcing XFormers.")
+            logger.info(
+                "Encoder-Decoder Model Architecture %s requires XFormers "
+                "backend; overriding backend auto-selection and "
+                "forcing XFormers.", get_architecture_class_name(model))
             global_force_attn_backend(_Backend.XFORMERS)
         elif is_forced_by_global:
             # Backend override enforced by global variable takes
             # precedence over vLLM backend environment variable.
-            if maybe_global_forced_backend != _Backend.XFORMERS:
+            if maybe_global_forced_backend not in\
+                 [_Backend.XFORMERS, _Backend.FLASH_ATTN]:
                 raise_backend_err()
         elif is_forced_by_env_var:
             # Backend override enforced by vLLM backend
             # environment variable
-            if maybe_env_var_forced_backend != _Backend.XFORMERS:
+            if maybe_env_var_forced_backend not in\
+                 [_Backend.XFORMERS, _Backend.FLASH_ATTN]:
                 raise_backend_err()
 
     def _list_to_int32_tensor(
@@ -532,6 +545,7 @@ def _prepare_encoder_model_input_tensors(
             attn_metadata.encoder_seq_lens,
             attn_metadata.encoder_seq_lens_tensor,
             attn_metadata.max_encoder_seq_len,
+            attn_metadata.encoder_seq_start_loc,
             attn_metadata.cross_slot_mapping,
             attn_metadata.cross_block_tables,
         ) = (
@@ -539,6 +553,7 @@ def _prepare_encoder_model_input_tensors(
             encoder_seq_lens,
             encoder_seq_lens_tensor,
             max_encoder_seq_len,
+            encoder_seq_start_loc,
             cross_slot_mapping_tensor,
             cross_block_tables,
         )

From af7380d83b0d67726a4a6c7a86766423bed6a7a8 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 1 Nov 2024 23:35:47 -0700
Subject: [PATCH 24/85] [torch.compile] fix cpu broken code (#9947)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/utils.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index 1041120a24b3f..a742ec8d76908 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1551,7 +1551,14 @@ def direct_register_custom_op(
     """
     if is_in_doc_build():
         return
-    schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
+    import torch.library
+    if hasattr(torch.library, "infer_schema"):
+        schema_str = torch.library.infer_schema(op_func,
+                                                mutates_args=mutates_args)
+    else:
+        # for pytorch 2.4
+        import torch._custom_op.impl
+        schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
     my_lib = target_lib or vllm_lib
     my_lib.define(op_name + schema_str)
     my_lib.impl(op_name, op_func, "CUDA")

From eed92f12fc829ff074e7341283cb1677b7e65aa2 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Sat, 2 Nov 2024 09:02:18 +0000
Subject: [PATCH 25/85] [Docs] Update Granite 3.0 models in supported models
 table (#9930)

Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/source/models/supported_models.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 80714a90df5c2..a5c085bb84db9 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -160,13 +160,13 @@ Text Generation
     -
     - ✅︎
   * - :code:`GraniteForCausalLM`
-    - PowerLM
-    - :code:`ibm/PowerLM-3b` etc.
+    - Granite 3.0, PowerLM
+    - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.0-8b-instruct`, :code:`ibm/PowerLM-3b`, etc.
     - ✅︎
     - ✅︎
   * - :code:`GraniteMoeForCausalLM`
-    - PowerMoE
-    - :code:`ibm/PowerMoE-3b` etc.
+    - Granite 3.0 MoE, PowerMoE
+    - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc.
     - ✅︎
     - ✅︎
   * - :code:`InternLMForCausalLM`

From 1d4cfe2be1907408d610489bdca7bc8f8d2345b1 Mon Sep 17 00:00:00 2001
From: Michael Green <59619482+mikegre-google@users.noreply.github.com>
Date: Sat, 2 Nov 2024 14:06:45 +0000
Subject: [PATCH 26/85] [Doc] Updated tpu-installation.rst with more details
 (#9926)

Signed-off-by: Michael Green <mikegre@google.com>
---
 .../getting_started/tpu-installation.rst      | 158 ++++++++++++++++--
 1 file changed, 144 insertions(+), 14 deletions(-)

diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst
index edba209986f6a..f0c812b941c1f 100644
--- a/docs/source/getting_started/tpu-installation.rst
+++ b/docs/source/getting_started/tpu-installation.rst
@@ -1,35 +1,167 @@
 .. _installation_tpu:
 
+#####################
 Installation with TPU
-=====================
+#####################
 
-vLLM supports Google Cloud TPUs using PyTorch XLA.
+Tensor Processing Units (TPUs) are Google's custom-developed application-specific 
+integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs 
+are available in different versions each with different hardware specifications.
+For more information about TPUs, see `TPU System Architecture <https://cloud.google.com/tpu/docs/system-architecture-tpu-vm>`_. 
+For more information on the TPU versions supported with vLLM, see:
+
+* `TPU v6e <https://cloud.google.com/tpu/docs/v6e>`_
+* `TPU v5e <https://cloud.google.com/tpu/docs/v5e>`_
+* `TPU v5p <https://cloud.google.com/tpu/docs/v5p>`_
+* `TPU v4 <https://cloud.google.com/tpu/docs/v4>`_
+
+These TPU versions allow you to configure the physical arrangements of the TPU 
+chips. This can improve throughput and networking performance. For more 
+information see: 
+
+* `TPU v6e topologies <https://cloud.google.com/tpu/docs/v6e#configurations>`_
+* `TPU v5e topologies <https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config>`_
+* `TPU v5p topologies <https://cloud.google.com/tpu/docs/v5p#tpu-v5p-config>`_
+* `TPU v4 topologies <https://cloud.google.com/tpu/docs/v4#tpu-v4-config>`_
+
+In order for you to use Cloud TPUs you need to have TPU quota granted to your 
+Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a
+GCP project and are specified in terms of TPU version, the number of TPUs you 
+want to use, and quota type. For more information, see `TPU quota <https://cloud.google.com/tpu/docs/quota#tpu_quota>`_. 
+
+For TPU pricing information, see `Cloud TPU pricing <https://cloud.google.com/tpu/pricing>`_.
+
+You may need additional persistent storage for your TPU VMs. For more 
+information, see `Storage options for Cloud TPU data <https://cloud.google.com/tpu/docs/storage-options>`_.
 
 Requirements
 ------------
 
-* Google Cloud TPU VM (single & multi host)
-* TPU versions: v5e, v5p, v4
-* Python: 3.10
+* Google Cloud TPU VM 
+* TPU versions: v6e, v5e, v5p, v4
+* Python: 3.10 or newer
+
+Provision Cloud TPUs
+====================
+
+You can provision Cloud TPUs using the `Cloud TPU API <https://cloud.google.com/tpu/docs/reference/rest>`_ 
+or the `queued resources <https://cloud.google.com/tpu/docs/queued-resources>`_ 
+API. This section shows how to create TPUs using the queued resource API. 
+For more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API <https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api>`_. 
+`Queued resources <https://cloud.google.com/tpu/docs/queued-resources>`_
+enable you to request Cloud TPU resources in a queued manner. When you request 
+queued resources, the request is added to a queue maintained by the Cloud TPU 
+service. When the requested resource becomes available, it's assigned to your 
+Google Cloud project for your immediate exclusive use. 
+
+Provision a Cloud TPU with the queued resource API
+--------------------------------------------------
+Create a TPU v5e with 4 TPU chips:
+
+.. code-block:: console
+
+    gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
+    --node-id TPU_NAME \
+    --project PROJECT_ID \
+    --zone ZONE \
+    --accelerator-type ACCELERATOR_TYPE \
+    --runtime-version RUNTIME_VERSION \
+    --service-account SERVICE_ACCOUNT
+
+.. list-table:: Parameter descriptions
+    :header-rows: 1
+
+    * - Parameter name
+      - Description
+    * - QUEUED_RESOURCE_ID
+      - The user-assigned ID of the queued resource request.
+    * - TPU_NAME
+      - The user-assigned name of the TPU which is created when the queued 
+        resource request is allocated.
+    * - PROJECT_ID
+      - Your Google Cloud project
+    * - ZONE
+      - The `zone <https://cloud.google.com/tpu/docs/regions-zones>`_ where you 
+        want to create your Cloud TPU.
+    * - ACCELERATOR_TYPE
+      - The TPU version you want to use. Specify the TPU version, followed by a 
+        '-' and the number of TPU cores. For example `v5e-4` specifies a v5e TPU 
+        with 4 cores. For more information, see `TPU versions <https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
+    * - RUNTIME_VERSION
+      - The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
+    * - SERVICE_ACCOUNT
+      - The email address for your service account. You can find it in the IAM 
+        Cloud Console under *Service Accounts*. For example: 
+        `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
+
+Connect to your TPU using SSH:
+
+.. code-block:: bash
+
+    gcloud compute tpus tpu-vm ssh TPU_NAME
+
+Create and activate a Conda environment for vLLM:
+
+.. code-block:: bash
 
-Installation options:
+    conda create -n vllm python=3.10 -y
+    conda activate vllm
 
-1. :ref:`Build a docker image with Dockerfile <build_docker_tpu>`.
-2. :ref:`Build from source <build_from_source_tpu>`.
+Clone the vLLM repository and go to the vLLM directory:
+
+.. code-block:: bash
+
+    git clone https://github.com/vllm-project/vllm.git && cd vllm
+
+Uninstall the existing `torch` and `torch_xla` packages:
+
+.. code-block:: bash
+
+    pip uninstall torch torch-xla -y
+
+Install `torch` and `torch_xla`:
+
+.. code-block:: bash
+
+    pip install --pre torch==2.6.0.dev20241028+cpu torchvision==0.20.0.dev20241028+cpu --index-url https://download.pytorch.org/whl/nightly/cpu
+    pip install 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' -f https://storage.googleapis.com/libtpu-releases/index.html
+
+Install JAX and Pallas:
+
+.. code-block:: bash
+
+    pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+    pip install jaxlib==0.4.32.dev20240829 jax==0.4.32.dev20240829 -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+
+Install other build dependencies:
+
+.. code-block:: bash
+
+    pip install -r requirements-tpu.txt
+    VLLM_TARGET_DEVICE="tpu" python setup.py develop
+    sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev 
+
+Provision Cloud TPUs with GKE 
+-----------------------------
+
+For more information about using TPUs with GKE, see 
+https://cloud.google.com/kubernetes-engine/docs/how-to/tpus,
+https://cloud.google.com/kubernetes-engine/docs/concepts/tpus, and
+https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus.
 
 .. _build_docker_tpu:
 
 Build a docker image with :code:`Dockerfile.tpu`
 ------------------------------------------------
 
-`Dockerfile.tpu <https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu>`_ is provided to build a docker image with TPU support.
+You can use `Dockerfile.tpu <https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu>`_ 
+to build a Docker image with TPU support.
 
 .. code-block:: console
 
     $ docker build -f Dockerfile.tpu -t vllm-tpu .
 
-
-You can run the docker image with the following command:
+Run the Docker image with the following command:
 
 .. code-block:: console
 
@@ -75,14 +207,12 @@ Next, build vLLM from source. This will only take a few seconds:
 
     $ VLLM_TARGET_DEVICE="tpu" python setup.py develop
 
-
 .. note::
 
     Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape.
     The compilation time may take 20~30 minutes in the first run.
     However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default).
 
-
 .. tip::
 
     If you encounter the following error:
@@ -93,7 +223,7 @@ Next, build vLLM from source. This will only take a few seconds:
         ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory
 
 
-    Please install OpenBLAS with the following command:
+    Install OpenBLAS with the following command:
 
     .. code-block:: console
 

From e8937954434037ac787efa800f01d9d294185439 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 2 Nov 2024 07:35:05 -0700
Subject: [PATCH 27/85] [2/N] executor pass the complete config to
 worker/modelrunner (#9938)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
---
 tests/lora/test_long_context.py               |  8 +--
 tests/lora/test_worker.py                     | 12 +++--
 tests/spec_decode/utils.py                    |  7 +--
 .../test_encoder_decoder_model_runner.py      |  9 +---
 tests/worker/test_model_runner.py             | 10 +---
 tests/worker/test_profile.py                  |  7 +--
 tests/worker/test_swap.py                     |  7 +--
 vllm/config.py                                | 24 ++++-----
 vllm/engine/arg_utils.py                      | 13 ++---
 vllm/engine/async_llm_engine.py               |  8 +--
 vllm/engine/llm_engine.py                     |  9 ++--
 vllm/engine/multiprocessing/client.py         |  4 +-
 vllm/executor/cpu_executor.py                 |  9 +---
 vllm/executor/executor_base.py                |  4 +-
 vllm/executor/gpu_executor.py                 | 11 +---
 vllm/executor/neuron_executor.py              |  6 +--
 vllm/executor/openvino_executor.py            |  8 +--
 vllm/executor/tpu_executor.py                 |  7 +--
 vllm/spec_decode/draft_model_runner.py        | 36 ++-----------
 vllm/spec_decode/ngram_worker.py              |  2 +-
 vllm/spec_decode/spec_decode_worker.py        | 35 ++++++-------
 vllm/spec_decode/target_model_runner.py       | 34 ++++---------
 vllm/v1/engine/llm_engine.py                  |  9 ++--
 vllm/v1/executor/gpu_executor.py              | 11 +---
 vllm/v1/worker/gpu_model_runner.py            | 41 +++++++--------
 vllm/v1/worker/gpu_worker.py                  | 50 +++++++------------
 vllm/worker/cpu_model_runner.py               | 25 +++-------
 vllm/worker/cpu_worker.py                     | 37 ++++----------
 vllm/worker/embedding_model_runner.py         | 26 ++--------
 vllm/worker/enc_dec_model_runner.py           | 25 ++--------
 vllm/worker/model_runner.py                   | 28 +++--------
 vllm/worker/model_runner_base.py              | 17 +++++++
 vllm/worker/multi_step_model_runner.py        |  1 +
 vllm/worker/multi_step_worker.py              | 10 +---
 vllm/worker/neuron_model_runner.py            | 16 ++----
 vllm/worker/neuron_worker.py                  | 20 +++-----
 vllm/worker/openvino_model_runner.py          | 33 +++++-------
 vllm/worker/openvino_worker.py                | 34 +++----------
 vllm/worker/tpu_model_runner.py               | 17 ++-----
 vllm/worker/tpu_worker.py                     | 28 +++--------
 vllm/worker/worker.py                         | 45 ++++-------------
 vllm/worker/worker_base.py                    | 18 ++++++-
 vllm/worker/xpu_model_runner.py               | 29 +++--------
 vllm/worker/xpu_worker.py                     | 40 +++------------
 44 files changed, 250 insertions(+), 580 deletions(-)

diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py
index c8edb02a88d4b..eada902c891f7 100644
--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -138,13 +138,7 @@ def test_rotary_emb_replaced(dist_init):
                              enable_lora=True)
     engine_config = engine_args.create_engine_config()
     model_runner = ModelRunner(
-        model_config=engine_config.model_config,
-        parallel_config=engine_config.parallel_config,
-        scheduler_config=engine_config.scheduler_config,
-        device_config=engine_config.device_config,
-        cache_config=engine_config.cache_config,
-        load_config=engine_config.load_config,
-        lora_config=engine_config.lora_config,
+        vllm_config=engine_config,
         is_driver_worker=True,
     )
     model_runner.load_model()
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 2f7ac85507425..9d814f657ac43 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -4,7 +4,8 @@
 from unittest.mock import patch
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig)
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         VllmConfig)
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.worker.worker import Worker
@@ -12,7 +13,7 @@
 
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
-    worker = Worker(
+    vllm_config = VllmConfig(
         model_config=ModelConfig(
             "meta-llama/Llama-2-7b-hf",
             task="auto",
@@ -34,10 +35,13 @@ def test_worker_apply_lora(sql_lora_files):
                                  gpu_memory_utilization=1.,
                                  swap_space=0,
                                  cache_dtype="auto"),
-        local_rank=0,
-        rank=0,
         lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
                                max_loras=32),
+    )
+    worker = Worker(
+        vllm_config=vllm_config,
+        local_rank=0,
+        rank=0,
         distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
     )
     worker.init_device()
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index f683942a5854b..6cf0cfb09b8fa 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -81,12 +81,7 @@ def create_worker(cls: Callable[..., T],
         get_ip(), get_open_port())
 
     worker = cls(
-        model_config=engine_config.model_config,
-        parallel_config=engine_config.parallel_config,
-        scheduler_config=engine_config.scheduler_config,
-        device_config=engine_config.device_config,
-        cache_config=engine_config.cache_config,
-        load_config=engine_config.load_config,
+        vllm_config=engine_config,
         local_rank=0,
         rank=0,
         distributed_init_method=distributed_init_method,
diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py
index e75884a7395e2..9e166ae64dbfb 100644
--- a/tests/worker/test_encoder_decoder_model_runner.py
+++ b/tests/worker/test_encoder_decoder_model_runner.py
@@ -19,14 +19,7 @@ def _create_model_runner(model: str, *args,
     engine_args = EngineArgs(model, *args, **kwargs)
     engine_config = engine_args.create_engine_config()
     model_runner = EncoderDecoderModelRunner(
-        model_config=engine_config.model_config,
-        parallel_config=engine_config.parallel_config,
-        scheduler_config=engine_config.scheduler_config,
-        device_config=engine_config.device_config,
-        cache_config=engine_config.cache_config,
-        load_config=engine_config.load_config,
-        lora_config=engine_config.lora_config,
-        prompt_adapter_config=engine_config.prompt_adapter_config,
+        vllm_config=engine_config,
         is_driver_worker=True,
     )
     return model_runner
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index fe97199bac62d..433a9b30ba57a 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -16,15 +16,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
     engine_args = EngineArgs(model, *args, **kwargs)
     engine_config = engine_args.create_engine_config()
     model_runner = ModelRunner(
-        model_config=engine_config.model_config,
-        parallel_config=engine_config.parallel_config,
-        scheduler_config=engine_config.scheduler_config,
-        device_config=engine_config.device_config,
-        cache_config=engine_config.cache_config,
-        load_config=engine_config.load_config,
-        lora_config=engine_config.lora_config,
-        prompt_adapter_config=engine_config.prompt_adapter_config,
-        observability_config=engine_config.observability_config,
+        vllm_config=engine_config,
         is_driver_worker=True,
     )
     return model_runner
diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py
index acd2ed6836365..194ea2aa506f4 100644
--- a/tests/worker/test_profile.py
+++ b/tests/worker/test_profile.py
@@ -24,12 +24,7 @@ def test_gpu_memory_profiling():
     distributed_init_method = get_distributed_init_method(
         get_ip(), get_open_port())
     worker = Worker(
-        model_config=engine_config.model_config,
-        parallel_config=engine_config.parallel_config,
-        scheduler_config=engine_config.scheduler_config,
-        device_config=engine_config.device_config,
-        cache_config=engine_config.cache_config,
-        load_config=engine_config.load_config,
+        vllm_config=engine_config,
         local_rank=0,
         rank=0,
         distributed_init_method=distributed_init_method,
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index 7aa439ba0a154..acede959f59f8 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -19,12 +19,7 @@ def test_swap() -> None:
     distributed_init_method = get_distributed_init_method(
         get_ip(), get_open_port())
     worker = Worker(
-        model_config=engine_config.model_config,
-        parallel_config=engine_config.parallel_config,
-        scheduler_config=engine_config.scheduler_config,
-        device_config=engine_config.device_config,
-        cache_config=engine_config.cache_config,
-        load_config=engine_config.load_config,
+        vllm_config=engine_config,
         local_rank=0,
         rank=0,
         distributed_init_method=distributed_init_method,
diff --git a/vllm/config.py b/vllm/config.py
index c2a8c956b374a..17e9b1c100498 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,6 +1,6 @@
 import enum
 import json
-from dataclasses import dataclass, field, fields
+from dataclasses import dataclass, field
 from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal,
                     Mapping, Optional, Set, Tuple, Type, Union)
 
@@ -1941,9 +1941,9 @@ def __post_init__(self):
                 f"installed. Original error:\n{otel_import_error_traceback}")
 
 
-@dataclass(frozen=True)
-class EngineConfig:
-    """Dataclass which contains all engine-related configuration. This
+@dataclass
+class VllmConfig:
+    """Dataclass which contains all vllm-related configuration. This
     simplifies passing around the distinct configurations in the codebase.
     """
 
@@ -1953,11 +1953,11 @@ class EngineConfig:
     scheduler_config: SchedulerConfig
     device_config: DeviceConfig
     load_config: LoadConfig
-    lora_config: Optional[LoRAConfig]
-    speculative_config: Optional[SpeculativeConfig]
-    decoding_config: Optional[DecodingConfig]
-    observability_config: Optional[ObservabilityConfig]
-    prompt_adapter_config: Optional[PromptAdapterConfig]
+    lora_config: Optional[LoRAConfig] = None
+    speculative_config: Optional[SpeculativeConfig] = None
+    decoding_config: Optional[DecodingConfig] = None
+    observability_config: Optional[ObservabilityConfig] = None
+    prompt_adapter_config: Optional[PromptAdapterConfig] = None
 
     def __post_init__(self):
         """Verify configs are valid & consistent with each other.
@@ -1975,9 +1975,3 @@ def __post_init__(self):
         if self.prompt_adapter_config:
             self.prompt_adapter_config.verify_with_model_config(
                 self.model_config)
-
-    def to_dict(self):
-        """Return the configs as a dictionary, for use in **kwargs.
-        """
-        return dict(
-            (field.name, getattr(self, field.name)) for field in fields(self))
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b1f0f8b9df925..da06ab186821e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -9,10 +9,11 @@
 
 import vllm.envs as envs
 from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig,
-                         DeviceConfig, EngineConfig, LoadConfig, LoadFormat,
-                         LoRAConfig, ModelConfig, ObservabilityConfig,
-                         ParallelConfig, PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig, TaskOption, TokenizerPoolConfig)
+                         DeviceConfig, LoadConfig, LoadFormat, LoRAConfig,
+                         ModelConfig, ObservabilityConfig, ParallelConfig,
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig, TaskOption, TokenizerPoolConfig,
+                         VllmConfig)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -955,7 +956,7 @@ def create_load_config(self) -> LoadConfig:
             ignore_patterns=self.ignore_patterns,
         )
 
-    def create_engine_config(self) -> EngineConfig:
+    def create_engine_config(self) -> VllmConfig:
         # gguf file needs a specific model loader and doesn't use hf_repo
         if check_gguf_file(self.model):
             self.quantization = self.load_format = "gguf"
@@ -1167,7 +1168,7 @@ def create_engine_config(self) -> EngineConfig:
             or "all" in detailed_trace_modules,
         )
 
-        return EngineConfig(
+        return VllmConfig(
             model_config=model_config,
             cache_config=cache_config,
             parallel_config=parallel_config,
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 6aeaf484a22b4..b0fdc67776bbd 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -7,8 +7,8 @@
 from weakref import ReferenceType
 
 import vllm.envs as envs
-from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig)
+from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, VllmConfig)
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_timeout import asyncio_timeout
@@ -604,7 +604,7 @@ def __del__(self):
 
     @classmethod
     def _get_executor_cls(
-            cls, engine_config: EngineConfig) -> Type[ExecutorAsyncBase]:
+            cls, engine_config: VllmConfig) -> Type[ExecutorAsyncBase]:
         distributed_executor_backend = (
             engine_config.parallel_config.distributed_executor_backend)
         if isinstance(distributed_executor_backend, type):
@@ -663,7 +663,7 @@ def _get_executor_cls(
     def from_engine_args(
         cls,
         engine_args: AsyncEngineArgs,
-        engine_config: Optional[EngineConfig] = None,
+        engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index e6fe1effb8287..b12d29c4a8503 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -13,8 +13,9 @@
 from typing_extensions import TypeIs, TypeVar
 
 import vllm.envs as envs
-from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
-                         ObservabilityConfig, ParallelConfig, SchedulerConfig)
+from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
+                         ObservabilityConfig, ParallelConfig, SchedulerConfig,
+                         VllmConfig)
 from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
                                  SchedulerOutputs)
 from vllm.engine.arg_utils import EngineArgs
@@ -219,7 +220,7 @@ def validate_outputs(
 
     def __init__(
         self,
-        vllm_config: EngineConfig,
+        vllm_config: VllmConfig,
         executor_class: Type[ExecutorBase],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -500,7 +501,7 @@ def _initialize_kv_caches(self) -> None:
 
     @classmethod
     def _get_executor_cls(cls,
-                          engine_config: EngineConfig) -> Type[ExecutorBase]:
+                          engine_config: VllmConfig) -> Type[ExecutorBase]:
         distributed_executor_backend = (
             engine_config.parallel_config.distributed_executor_backend)
         # Initialize the cluster and specify the executor class.
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 6e6630b3ff55f..7f1ca621d91c4 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -13,7 +13,7 @@
 from zmq.asyncio import Socket
 
 from vllm import PoolingParams
-from vllm.config import DecodingConfig, EngineConfig, ModelConfig
+from vllm.config import DecodingConfig, ModelConfig, VllmConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
 # yapf conflicts with isort for this block
@@ -78,7 +78,7 @@ class MQLLMEngineClient(EngineClient):
             every N seconds, confirming the engine is healthy
     """
 
-    def __init__(self, ipc_path: str, engine_config: EngineConfig,
+    def __init__(self, ipc_path: str, engine_config: VllmConfig,
                  engine_pid: int):
         self.context = zmq.asyncio.Context()
         self._errored_with: Optional[BaseException] = None
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index e32993e0e452e..ab3ebb4e43d18 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -138,18 +138,11 @@ def _create_worker(
         assert self.distributed_init_method is not None
 
         kwargs = dict(
-            model_config=self.model_config,
-            parallel_config=self.parallel_config,
-            scheduler_config=self.scheduler_config,
-            device_config=self.device_config,
-            cache_config=self.cache_config,
-            load_config=self.load_config,
+            vllm_config=self.vllm_config,
             local_rank=local_rank,
             rank=rank,
             distributed_init_method=self.distributed_init_method,
-            lora_config=self.lora_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
-            prompt_adapter_config=self.prompt_adapter_config,
             is_driver_worker=rank == 0,
         )
         wrapper.init_worker(**kwargs)
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 2248eecd1849f..9cba189dd57f9 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import List, Optional, Set, Tuple
 
-from vllm.config import EngineConfig
+from vllm.config import VllmConfig
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -20,7 +20,7 @@ class ExecutorBase(ABC):
 
     def __init__(
         self,
-        vllm_config: EngineConfig,
+        vllm_config: VllmConfig,
     ) -> None:
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index ed30d3186a453..c65d0836e5ff7 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -49,21 +49,12 @@ def _get_worker_kwargs(
             distributed_init_method = get_distributed_init_method(
                 get_ip(), get_open_port())
         return dict(
-            model_config=self.model_config,
-            parallel_config=self.parallel_config,
-            scheduler_config=self.scheduler_config,
-            device_config=self.device_config,
-            cache_config=self.cache_config,
-            load_config=self.load_config,
+            vllm_config=self.vllm_config,
             local_rank=local_rank,
             rank=rank,
             distributed_init_method=distributed_init_method,
-            lora_config=self.lora_config,
-            speculative_config=self.speculative_config,
-            prompt_adapter_config=self.prompt_adapter_config,
             is_driver_worker=(not self.parallel_config)
             or (rank % self.parallel_config.tensor_parallel_size == 0),
-            observability_config=self.observability_config,
         )
 
     def _get_worker_module_and_class(
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index f2fcfa58b26e1..02d37cd7fbf23 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -29,11 +29,7 @@ def _init_worker(self):
         distributed_init_method = get_distributed_init_method(
             get_ip(), get_open_port())
         self.driver_worker = NeuronWorker(
-            model_config=self.model_config,
-            parallel_config=self.parallel_config,
-            scheduler_config=self.scheduler_config,
-            device_config=self.device_config,
-            cache_config=self.cache_config,
+            vllm_config=self.vllm_config,
             local_rank=0,
             rank=0,
             distributed_init_method=distributed_init_method)
diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
index d0c0333854dae..d06b0ccb7906e 100644
--- a/vllm/executor/openvino_executor.py
+++ b/vllm/executor/openvino_executor.py
@@ -48,16 +48,10 @@ def _init_worker(self):
             get_ip(), get_open_port())
         self.driver_worker = OpenVINOWorker(
             ov_core=self.ov_core,
-            model_config=self.model_config,
-            parallel_config=self.parallel_config,
-            scheduler_config=self.scheduler_config,
-            device_config=self.device_config,
-            cache_config=self.cache_config,
-            load_config=self.load_config,
+            vllm_config=self.vllm_config,
             local_rank=0,
             rank=0,
             distributed_init_method=distributed_init_method,
-            lora_config=self.lora_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=True,
         )
diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
index 972649dedf33e..e37e8973790db 100644
--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
@@ -44,12 +44,7 @@ def _get_worker_kwargs(
             distributed_init_method = get_distributed_init_method(
                 get_ip(), get_open_port())
         return dict(
-            model_config=self.model_config,
-            parallel_config=self.parallel_config,
-            scheduler_config=self.scheduler_config,
-            device_config=self.device_config,
-            cache_config=self.cache_config,
-            load_config=self.load_config,
+            vllm_config=self.vllm_config,
             local_rank=local_rank,
             rank=rank,
             distributed_init_method=distributed_init_method,
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index 3aa999fcb9ebb..17cc0ad1a4a3a 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -17,9 +17,6 @@
         "Draft model speculative decoding currently only supports"
         "CUDA and ROCm flash attention backend.") from err
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.multimodal import MultiModalInputs
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
@@ -49,40 +46,13 @@ class TP1DraftModelRunner(ModelRunner):
        any broadcasting inside execute_model).
     """
 
-    def __init__(
-        self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
-        kv_cache_dtype: Optional[str] = "auto",
-        is_driver_worker: bool = False,
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
-        return_hidden_states: bool = False,
-        observability_config: Optional[ObservabilityConfig] = None,
-    ):
-        if return_hidden_states:
+    def __init__(self, *args, **kwargs):
+        if kwargs.get("return_hidden_states"):
             raise ValueError(
                 "return_hidden_states is not supported for TP1DraftModelRunner."
             )
 
-        super().__init__(
-            model_config=model_config,
-            parallel_config=parallel_config,
-            scheduler_config=scheduler_config,
-            device_config=device_config,
-            cache_config=cache_config,
-            load_config=load_config,
-            lora_config=lora_config,
-            kv_cache_dtype=kv_cache_dtype,
-            is_driver_worker=is_driver_worker,
-            prompt_adapter_config=prompt_adapter_config,
-            return_hidden_states=return_hidden_states,
-            observability_config=observability_config,
-        )
+        super().__init__(*args, **kwargs)
 
     def _update_sampling_metadata(self, sampling_metadata, num_seqs,
                                   num_queries):
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index a777e5c3f22a7..debb3b2d5ec30 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -21,7 +21,7 @@ class NGramWorker(NonLLMProposerWorkerBase):
     def __init__(self, *args, **kwargs):
         # Get local_rank/vocab_size from kwargs attribute
         self.local_rank = kwargs["local_rank"]
-        self.vocab_size = kwargs["model_config"].get_vocab_size()
+        self.vocab_size = kwargs["vllm_config"].model_config.get_vocab_size()
 
         # Lazy initialization list.
         self._proposer: Top1Proposer
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 9f7ef2f8d851c..a402181b13db8 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -1,10 +1,11 @@
+import copy
 from collections import defaultdict
 from functools import cached_property
 from typing import Any, Dict, List, Optional, Set, Tuple, Type
 
 import torch
 
-from vllm.config import ParallelConfig, SpeculativeConfig
+from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig
 from vllm.distributed.communication_op import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
@@ -45,8 +46,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     """Helper method that is the entrypoint for Executors which use
     WorkerWrapper. It constructs a SpecDecodeWorker from the speculative config.
     """
-    assert "speculative_config" in kwargs
-    speculative_config: SpeculativeConfig = kwargs.get("speculative_config")
+    vllm_config: VllmConfig = kwargs.get("vllm_config")
+    speculative_config: SpeculativeConfig = vllm_config.speculative_config
     assert speculative_config is not None
 
     draft_worker_kwargs = kwargs.copy()
@@ -58,14 +59,16 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     target_worker.model_runner.disable_logprobs =\
          speculative_config.disable_logprobs
 
+    draft_worker_config = copy.deepcopy(vllm_config)
+    draft_worker_config.model_config = speculative_config.draft_model_config
+    draft_worker_config.parallel_config = speculative_config.draft_parallel_config  # noqa
+    # TODO allow draft-model specific load config.
+
     # Override draft-model specific worker args.
     draft_worker_kwargs.update(
-        model_config=speculative_config.draft_model_config,
-        parallel_config=speculative_config.draft_parallel_config,
+        vllm_config=draft_worker_config,
         ngram_prompt_lookup_max=speculative_config.ngram_prompt_lookup_max,
         ngram_prompt_lookup_min=speculative_config.ngram_prompt_lookup_min,
-        # TODO allow draft-model specific load config.
-        #load_config=load_config,
     )
 
     spec_decode_worker = SpecDecodeWorker.create_worker(
@@ -134,29 +137,27 @@ def create_worker(
             draft_worker_kwargs.pop("ngram_prompt_lookup_max"))
         ngram_prompt_lookup_min = (
             draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
+        draft_model_config = draft_worker_kwargs["vllm_config"].model_config
+        draft_parallel_config: ParallelConfig = draft_worker_kwargs[
+            'vllm_config'].parallel_config
         if ngram_prompt_lookup_max > 0:
             proposer_worker = NGramWorker(**draft_worker_kwargs)
             proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
                                                   ngram_prompt_lookup_max)
         else:
-            draft_parallel_config: ParallelConfig = draft_worker_kwargs[
-                'parallel_config']
             draft_tp = draft_parallel_config.tensor_parallel_size
             target_tp = scorer_worker.parallel_config.tensor_parallel_size
 
-            if draft_worker_kwargs[
-                    "model_config"].hf_config.model_type == "mlp_speculator":
+            if draft_model_config.hf_config.model_type == "mlp_speculator":
                 proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs)
-            elif draft_worker_kwargs[
-                    "model_config"].hf_config.model_type == "medusa":
+            elif draft_model_config.hf_config.model_type == "medusa":
                 proposer_worker = MedusaWorker(**draft_worker_kwargs)
             else:
                 if draft_tp == 1:
                     draft_worker_kwargs[
                         "model_runner_cls"] = TP1DraftModelRunner
                 else:
-                    if draft_worker_kwargs[
-                            "model_config"].hf_config.model_type == "eagle":
+                    if draft_model_config.hf_config.model_type == "eagle":
                         raise NotImplementedError(
                             "EAGLE does not support TP > 1 yet")
 
@@ -190,8 +191,8 @@ def create_worker(
                     "[Speculative Decoding] Disabling MQA scorer as the "
                     "MQA is only available with flash attn backend.")
 
-            if "model_config" in draft_worker_kwargs and \
-                draft_worker_kwargs["model_config"].max_model_len < \
+            if draft_model_config and \
+                draft_model_config.max_model_len < \
                     scorer_worker.model_config.max_model_len:
                 disable_mqa_scorer = True
                 logger.info(
diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py
index 2bb7af7d7c600..e61cde5b17f20 100644
--- a/vllm/spec_decode/target_model_runner.py
+++ b/vllm/spec_decode/target_model_runner.py
@@ -1,8 +1,6 @@
 from typing import List, Optional
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.sequence import SequenceGroupMetadata
 from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata,
                                       ModelRunner)
@@ -20,35 +18,21 @@ class TargetModelRunner(ModelRunner):
     requested or not. 
     """
 
-    def __init__(self,
-                 model_config: ModelConfig,
-                 parallel_config: ParallelConfig,
-                 scheduler_config: SchedulerConfig,
-                 device_config: DeviceConfig,
-                 cache_config: CacheConfig,
-                 load_config: LoadConfig,
-                 lora_config: Optional[LoRAConfig],
-                 kv_cache_dtype: Optional[str] = "auto",
-                 is_driver_worker: bool = False,
-                 prompt_adapter_config: Optional[PromptAdapterConfig] = None,
-                 return_hidden_states: bool = False,
-                 observability_config: Optional[ObservabilityConfig] = None):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        kv_cache_dtype: Optional[str] = "auto",
+        is_driver_worker: bool = False,
+        return_hidden_states: bool = False,
+    ):
         # An internal boolean member variable to indicate if token log
         # probabilities are needed or not.
         self.disable_logprobs = True
         super().__init__(
-            model_config=model_config,
-            parallel_config=parallel_config,
-            scheduler_config=scheduler_config,
-            device_config=device_config,
-            cache_config=cache_config,
-            load_config=load_config,
-            lora_config=lora_config,
+            vllm_config=vllm_config,
             kv_cache_dtype=kv_cache_dtype,
             is_driver_worker=is_driver_worker,
-            prompt_adapter_config=prompt_adapter_config,
             return_hidden_states=return_hidden_states,
-            observability_config=observability_config,
         )
 
     def prepare_model_input(
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index febabd2f31036..64cc18149d6c5 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -2,8 +2,9 @@
 from typing import (Any, Dict, Iterable, List, Mapping, Optional, Tuple, Type,
                     Union)
 
-from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
-                         ObservabilityConfig, ParallelConfig, SchedulerConfig)
+from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
+                         ObservabilityConfig, ParallelConfig, SchedulerConfig,
+                         VllmConfig)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics_types import StatLoggerBase
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs,
@@ -32,7 +33,7 @@ class LLMEngine:
 
     def __init__(
         self,
-        vllm_config: EngineConfig,
+        vllm_config: VllmConfig,
         executor_class: Type[GPUExecutor],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -477,7 +478,7 @@ def get_lora_config(self) -> LoRAConfig:
         return self.lora_config
 
     @classmethod
-    def _get_executor_cls(cls, engine_config: EngineConfig):
+    def _get_executor_cls(cls, engine_config: VllmConfig):
         return GPUExecutor
 
     def is_tracing_enabled(self) -> bool:
diff --git a/vllm/v1/executor/gpu_executor.py b/vllm/v1/executor/gpu_executor.py
index c780c7031c3d6..b12c500f1f9ee 100644
--- a/vllm/v1/executor/gpu_executor.py
+++ b/vllm/v1/executor/gpu_executor.py
@@ -56,19 +56,10 @@ def _create_worker(
             distributed_init_method = get_distributed_init_method(
                 get_ip(), get_open_port())
         return Worker(
-            model_config=self.model_config,
-            parallel_config=self.parallel_config,
-            scheduler_config=self.scheduler_config,
-            device_config=self.device_config,
-            cache_config=self.cache_config,
-            load_config=self.load_config,
+            vllm_config=self.vllm_config,
             local_rank=local_rank,
             rank=rank,
             distributed_init_method=distributed_init_method,
-            lora_config=self.lora_config,
-            speculative_config=self.speculative_config,
-            prompt_adapter_config=self.prompt_adapter_config,
-            observability_config=self.observability_config,
         )
 
     def determine_num_available_blocks(self) -> Tuple[int, int]:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index e84645ac7a4ae..77c1e10ab6bdf 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -7,9 +7,7 @@
 import torch.distributed
 import torch.nn as nn
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
@@ -33,26 +31,25 @@ class GPUModelRunner:
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig] = None,
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
-        observability_config: Optional[ObservabilityConfig] = None,
+        vllm_config: VllmConfig,
     ):
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.load_config = load_config
-        self.prompt_adapter_config = prompt_adapter_config
-        self.observability_config = observability_config
-
+        # TODO: use ModelRunnerBase.__init__(self, vllm_config=vllm_config)
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.lora_config = vllm_config.lora_config
+        self.load_config = vllm_config.load_config
+        self.parallel_config = vllm_config.parallel_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.device_config = vllm_config.device_config
+        self.speculative_config = vllm_config.speculative_config
+        self.prompt_adapter_config = vllm_config.prompt_adapter_config
+        self.observability_config = vllm_config.observability_config
+
+        model_config = self.model_config
+        cache_config = self.cache_config
+        scheduler_config = self.scheduler_config
+        parallel_config = self.parallel_config
         self.device = self.device_config.device
         self.pin_memory = is_pin_memory_available()
         self.dtype = self.model_config.dtype
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 8c5ca2ec35666..c8192b7f86eb0 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -6,10 +6,7 @@
 import torch
 import torch.distributed
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig)
+from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
@@ -30,48 +27,35 @@ class Worker:
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
+        vllm_config: VllmConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
-        speculative_config: Optional[SpeculativeConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
-        observability_config: Optional[ObservabilityConfig] = None,
     ):
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.load_config = load_config
+
+        # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config)
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.lora_config = vllm_config.lora_config
+        self.load_config = vllm_config.load_config
+        self.parallel_config = vllm_config.parallel_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.device_config = vllm_config.device_config
+        self.speculative_config = vllm_config.speculative_config
+        self.prompt_adapter_config = vllm_config.prompt_adapter_config
+        self.observability_config = vllm_config.observability_config
+
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
-        self.lora_config = lora_config
-        self.speculative_config = speculative_config
-        self.prompt_adapter_config = prompt_adapter_config
-        self.observability_config = observability_config
 
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils import init_cached_hf_modules
             init_cached_hf_modules()
 
-        self.model_runner = GPUModelRunner(
-            model_config,
-            parallel_config,
-            scheduler_config,
-            device_config,
-            cache_config,
-            load_config,
-            lora_config=lora_config,
-        )
+        self.model_runner = GPUModelRunner(vllm_config)
 
     def initialize(self):
         if self.device_config.device.type == "cuda":
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 0c6fcdf03ba9e..a98faa2f2d0cb 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -8,9 +8,7 @@
 from torch import nn
 
 from vllm.attention import AttentionMetadata, get_attn_backend
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, PromptAdapterConfig,
-                         SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
@@ -412,29 +410,18 @@ class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
+        vllm_config: VllmConfig,
         kv_cache_dtype: Optional[str] = "auto",
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         is_driver_worker: bool = False,
         *args,
         **kwargs,
     ):
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
+        ModelRunnerBase.__init__(self, vllm_config)
         # Currently, CPU worker doesn't support chunked prefill.
         assert self.scheduler_config.chunked_prefill_enabled is False
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.prompt_adapter_config = prompt_adapter_config
-        self.load_config = load_config
+        model_config = self.model_config
+        cache_config = self.cache_config
+
         self.is_driver_worker = is_driver_worker
 
         self.device = self.device_config.device
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index ab93471b5af74..3778707ae07e8 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -6,9 +6,8 @@
 
 import vllm.envs as envs
 from vllm.attention import get_attn_backend
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, PromptAdapterConfig,
-                         SchedulerConfig)
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
+                         ParallelConfig, VllmConfig)
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
@@ -18,7 +17,8 @@
 from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner
 from vllm.worker.cpu_model_runner import CPUModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
-                                     LoraNotSupportedWorkerBase, WorkerInput)
+                                     LoraNotSupportedWorkerBase, WorkerBase,
+                                     WorkerInput)
 
 logger = init_logger(__name__)
 
@@ -121,31 +121,19 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
+        vllm_config: VllmConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
-        lora_config: Optional[LoRAConfig] = None,
         kv_cache_dtype: Optional[str] = "auto",
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         is_driver_worker: bool = False,
     ) -> None:
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.load_config = load_config
+        WorkerBase.__init__(self, vllm_config=vllm_config)
+
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
-        self.lora_config = lora_config
-        self.prompt_adapter_config = prompt_adapter_config
+
         self.is_driver_worker = is_driver_worker
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
@@ -166,15 +154,8 @@ def __init__(
         if self._is_encoder_decoder_model():
             ModelRunnerClass = CPUEncoderDecoderModelRunner
         self.model_runner: CPUModelRunner = ModelRunnerClass(
-            model_config,
-            parallel_config,
-            scheduler_config,
-            device_config,
-            cache_config,
-            load_config=self.load_config,
-            lora_config=self.lora_config,
+            vllm_config=vllm_config,
             kv_cache_dtype=kv_cache_dtype,
-            prompt_adapter_config=self.prompt_adapter_config,
             is_driver_worker=is_driver_worker)
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index a7f5b2d4fdd1f..ff288d5ca1512 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -3,9 +3,7 @@
 
 import torch
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
@@ -36,29 +34,13 @@ class EmbeddingModelRunner(
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
+        vllm_config: VllmConfig,
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
-        observability_config: Optional[ObservabilityConfig] = None,
     ):
-        super().__init__(model_config,
-                         parallel_config,
-                         scheduler_config,
-                         device_config,
-                         cache_config,
-                         load_config,
-                         lora_config=lora_config,
+        super().__init__(vllm_config=vllm_config,
                          kv_cache_dtype=kv_cache_dtype,
-                         is_driver_worker=is_driver_worker,
-                         prompt_adapter_config=prompt_adapter_config,
-                         observability_config=observability_config)
+                         is_driver_worker=is_driver_worker)
 
     @torch.inference_mode()
     def execute_model(
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 2ea314f8608ee..90a43196084ea 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -11,9 +11,7 @@
 from vllm.attention.selector import (_Backend, get_env_variable_attn_backend,
                                      get_global_forced_attn_backend,
                                      global_force_attn_backend)
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig)
+from vllm.config import ModelConfig, VllmConfig
 from vllm.forward_context import set_forward_context
 from vllm.inputs import INPUT_REGISTRY, InputRegistry
 from vllm.logger import init_logger
@@ -85,17 +83,9 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
+        vllm_config: VllmConfig,
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
-        observability_config: Optional[ObservabilityConfig] = None,
         input_registry: InputRegistry = INPUT_REGISTRY,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ):
@@ -107,15 +97,10 @@ def __init__(
         models) but these arguments are present here for compatibility with 
         the base-class constructor.
         '''
-        self._maybe_force_supported_attention_backend(model_config)
+        self._maybe_force_supported_attention_backend(vllm_config.model_config)
+
         super().__init__(
-            model_config,
-            parallel_config,
-            scheduler_config,
-            device_config,
-            cache_config,
-            load_config,
-            lora_config=None,
+            vllm_config=vllm_config,
             kv_cache_dtype=kv_cache_dtype,
             is_driver_worker=is_driver_worker,
         )
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index f2123c64c3274..0e200e6abb05e 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -20,9 +20,7 @@
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.compilation.compile_context import set_compile_context
 from vllm.compilation.levels import CompilationLevel
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.distributed import get_pp_group
 from vllm.distributed.parallel_state import graph_capture
@@ -955,32 +953,20 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
+        vllm_config: VllmConfig,
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         return_hidden_states: bool = False,
-        observability_config: Optional[ObservabilityConfig] = None,
         input_registry: InputRegistry = INPUT_REGISTRY,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ):
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.load_config = load_config
+
+        ModelRunnerBase.__init__(self, vllm_config)
+        model_config = self.model_config
+        cache_config = self.cache_config
+
         self.is_driver_worker = is_driver_worker
-        self.prompt_adapter_config = prompt_adapter_config
         self.return_hidden_states = return_hidden_states
-        self.observability_config = observability_config
 
         self.device = self.device_config.device
         self.pin_memory = is_pin_memory_available()
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index 89d7addb5a8d9..9e529f86b46bb 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -9,6 +9,7 @@
 import torch
 from torch import is_tensor
 
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.platforms import current_platform
@@ -220,6 +221,22 @@ class ModelRunnerBase(ABC, Generic[T]):
     ModelRunnerInputBase subclass.
     """
 
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+    ) -> None:
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.lora_config = vllm_config.lora_config
+        self.load_config = vllm_config.load_config
+        self.parallel_config = vllm_config.parallel_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.device_config = vllm_config.device_config
+        self.speculative_config = vllm_config.speculative_config
+        self.prompt_adapter_config = vllm_config.prompt_adapter_config
+        self.observability_config = vllm_config.observability_config
+
     # Map of request_id -> generator used for seeded random sampling
     generators: Dict[str, torch.Generator] = {}
 
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index be2f0d79154d6..3ee0fb4dc943e 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -304,6 +304,7 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]):
     # mypy: enable-error-code=type-var
 
     def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs):
+
         super().__init__(*args, **kwargs)
 
         # Check attention backend support.
diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py
index bf66f32d7d244..1f982fe103366 100644
--- a/vllm/worker/multi_step_worker.py
+++ b/vllm/worker/multi_step_worker.py
@@ -27,17 +27,9 @@ def __init__(self, *args, **kwargs):
         # for multi-step model, wrap the model runner with MultiStepModelRunner
         self.model_runner = MultiStepModelRunner(
             base_model_runner,
-            base_model_runner.model_config,
-            base_model_runner.parallel_config,
-            base_model_runner.scheduler_config,
-            base_model_runner.device_config,
-            base_model_runner.cache_config,
-            load_config=base_model_runner.load_config,
-            lora_config=self.lora_config,
+            vllm_config=base_model_runner.vllm_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=base_model_runner.is_driver_worker,
-            prompt_adapter_config=base_model_runner.prompt_adapter_config,
-            observability_config=base_model_runner.observability_config,
         )
 
         pipeline_parallel_size = self.parallel_config.pipeline_parallel_size
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index b8c760c4b5396..2da22cbfc7cb5 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -7,8 +7,7 @@
 from torch import nn
 from transformers_neuronx.config import GenerationConfig
 
-from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -57,20 +56,13 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
+        vllm_config: VllmConfig,
     ):
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-
+        ModelRunnerBase.__init__(self, vllm_config)
+        model_config = self.model_config
         if model_config is not None and model_config.get_sliding_window():
             logger.warning("Sliding window is not supported on Neuron. "
                            "The model will run without sliding window.")
-        self.device_config = (device_config
-                              if device_config is not None else DeviceConfig())
         self.device = self.device_config.device
         self.pin_memory = is_pin_memory_available()
 
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index fff14d6402b44..3f6269684ac93 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -4,15 +4,15 @@
 import torch
 import torch.distributed
 
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
 from vllm.worker.neuron_model_runner import NeuronModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
-                                     LoraNotSupportedWorkerBase, WorkerInput)
+                                     LoraNotSupportedWorkerBase, WorkerBase,
+                                     WorkerInput)
 
 
 class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
@@ -21,20 +21,12 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
+        vllm_config: VllmConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
     ) -> None:
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
+        WorkerBase.__init__(self, vllm_config=vllm_config)
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
@@ -44,7 +36,7 @@ def __init__(
             init_cached_hf_modules()
 
         self.model_runner: NeuronModelRunner = NeuronModelRunner(
-            model_config, parallel_config, scheduler_config, device_config)
+            vllm_config=vllm_config)
         self.is_driver_worker = True
 
     def init_device(self) -> None:
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 3da738636a59d..c9c87ea748081 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -7,9 +7,7 @@
 
 from vllm.attention import get_attn_backend
 from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -17,6 +15,7 @@
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                              MultiModalInputs, MultiModalPlaceholderMap)
 from vllm.sequence import SequenceGroupMetadata
+from vllm.worker.model_runner_base import ModelRunnerBase
 
 logger = init_logger(__name__)
 
@@ -39,33 +38,21 @@ def empty(cls, device):
                           multi_modal_kwargs={})
 
 
-class OpenVINOModelRunner:
+class OpenVINOModelRunner(ModelRunnerBase):
 
     def __init__(
         self,
         ov_core: ov.Core,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
-        multimodal_config: Optional[MultiModalConfig],
+        vllm_config: VllmConfig,
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
         *args,
         **kwargs,
     ):
         self.ov_core = ov_core
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.multimodal_config = multimodal_config
-        self.load_config = load_config
+        ModelRunnerBase.__init__(self, vllm_config=vllm_config)
+        cache_config = self.cache_config
+        model_config = self.model_config
         self.is_driver_worker = is_driver_worker
 
         self.device = self.device_config.device
@@ -369,3 +356,9 @@ def execute_model(
             sampling_metadata=sampling_metadata,
         )
         return output
+
+    def prepare_model_input(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def make_model_input_from_broadcasted_tensor_dict(self, *args, **kwargs):
+        raise NotImplementedError
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index a420d390c1ae4..205f8a337ce6c 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -7,9 +7,8 @@
 
 import vllm.envs as envs
 from vllm.attention import get_attn_backend
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig)
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
+                         ParallelConfig, VllmConfig)
 from vllm.distributed import (broadcast_tensor_dict,
                               ensure_model_parallel_initialized,
                               init_distributed_environment)
@@ -22,7 +21,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
 from vllm.worker.openvino_model_runner import OpenVINOModelRunner
-from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
 
 logger = init_logger(__name__)
 
@@ -212,33 +211,19 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
     def __init__(
         self,
         ov_core: ov.Core,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
+        vllm_config: VllmConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
-        lora_config: Optional[LoRAConfig] = None,
-        multimodal_config: Optional[MultiModalConfig] = None,
         kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined,
         is_driver_worker: bool = False,
     ) -> None:
         self.ov_core = ov_core
-        self.model_config = model_config
-        self.parallel_config = parallel_config
+        WorkerBase.__init__(self, vllm_config)
         self.parallel_config.rank = rank
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.load_config = load_config
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
-        self.lora_config = lora_config
-        self.multimodal_config = multimodal_config
         self.is_driver_worker = is_driver_worker
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
@@ -250,14 +235,7 @@ def __init__(
             init_cached_hf_modules()
         self.model_runner = OpenVINOModelRunner(
             self.ov_core,
-            model_config,
-            parallel_config,
-            scheduler_config,
-            device_config,
-            cache_config,
-            load_config=self.load_config,
-            lora_config=self.lora_config,
-            multimodal_config=self.multimodal_config,
+            vllm_config=self.vllm_config,
             kv_cache_dtype=kv_cache_dtype,
             is_driver_worker=is_driver_worker,
         )
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 3792cbc0f730f..7d9d669a45ce3 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -12,8 +12,7 @@
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
@@ -90,20 +89,10 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
+        vllm_config: VllmConfig,
         is_driver_worker: bool = False,
     ):
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.load_config = load_config
+        ModelRunnerBase.__init__(self, vllm_config=vllm_config)
         self.is_driver_worker = is_driver_worker
 
         self.block_size = self.cache_config.block_size
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index de6f7ab0072fd..096cb23416909 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -6,8 +6,7 @@
 import torch_xla.runtime as xr
 
 import vllm.envs as envs
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
@@ -16,7 +15,8 @@
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
 from vllm.worker.tpu_model_runner import TPUModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
-                                     LoraNotSupportedWorkerBase, WorkerInput)
+                                     LoraNotSupportedWorkerBase, WorkerBase,
+                                     WorkerInput)
 
 logger = init_logger(__name__)
 
@@ -25,24 +25,14 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
+        vllm_config: VllmConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
         is_driver_worker: bool,
     ) -> None:
-        self.model_config = model_config
-        self.parallel_config = parallel_config
+        WorkerBase.__init__(self, vllm_config=vllm_config)
         self.parallel_config.rank = rank
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.load_config = load_config
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
@@ -56,13 +46,7 @@ def __init__(
                 self.cache_config.cache_dtype]
 
         self.model_runner: TPUModelRunner = TPUModelRunner(
-            model_config,
-            parallel_config,
-            scheduler_config,
-            device_config,
-            cache_config,
-            load_config,
-            is_driver_worker=is_driver_worker)
+            vllm_config=vllm_config, is_driver_worker=is_driver_worker)
 
     def init_device(self) -> None:
         os.environ["PJRT_DEVICE"] = "TPU"
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index fd30962e5d6bb..8928936b4f9fc 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -7,10 +7,7 @@
 import torch.distributed
 
 import vllm.envs as envs
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig)
+from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
@@ -27,7 +24,8 @@
 from vllm.worker.embedding_model_runner import EmbeddingModelRunner
 from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
 from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
-from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput
+from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
+                                     WorkerInput)
 
 logger = init_logger(__name__)
 
@@ -42,46 +40,31 @@ class Worker(LocalOrDistributedWorkerBase):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
+        vllm_config: VllmConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
-        lora_config: Optional[LoRAConfig] = None,
-        speculative_config: Optional[SpeculativeConfig] = None,
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         is_driver_worker: bool = False,
         model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None,
-        observability_config: Optional[ObservabilityConfig] = None,
     ) -> None:
-        self.model_config = model_config
-        self.parallel_config = parallel_config
+        WorkerBase.__init__(self, vllm_config)
         self.parallel_config.rank = rank
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
-        self.lora_config = lora_config
-        self.load_config = load_config
-        self.prompt_adapter_config = prompt_adapter_config
         self.is_driver_worker = is_driver_worker
-        if parallel_config and is_driver_worker:
-            assert rank % parallel_config.tensor_parallel_size == 0, \
+        if is_driver_worker:
+            assert rank % self.parallel_config.tensor_parallel_size == 0, \
                    "Driver worker should be rank 0 of tensor parallel group."
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils import init_cached_hf_modules
             init_cached_hf_modules()
-        self.observability_config = observability_config
 
         # Return hidden states from target model if the draft model is an
         # mlp_speculator
+        speculative_config = self.speculative_config
+        model_config = self.model_config
         speculative_args = {} if speculative_config is None \
             or (speculative_config.draft_model_config.model ==
                 model_config.model) \
@@ -97,17 +80,9 @@ def __init__(
         elif self._is_encoder_decoder_model():
             ModelRunnerClass = EncoderDecoderModelRunner
         self.model_runner: GPUModelRunnerBase = ModelRunnerClass(
-            model_config,
-            parallel_config,
-            scheduler_config,
-            device_config,
-            cache_config,
-            load_config=load_config,
-            lora_config=self.lora_config,
+            vllm_config=self.vllm_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=is_driver_worker,
-            prompt_adapter_config=prompt_adapter_config,
-            observability_config=observability_config,
             **speculative_args,
         )
         # Uninitialized cache engine. Will be initialized by
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 6ba4f272315ce..cf8a4946a71c4 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -7,7 +7,7 @@
 
 import torch
 
-from vllm.config import ObservabilityConfig
+from vllm.config import ObservabilityConfig, VllmConfig
 from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -29,6 +29,22 @@ class WorkerBase(ABC):
     communicate request metadata to other workers.
     """
 
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+    ) -> None:
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.lora_config = vllm_config.lora_config
+        self.load_config = vllm_config.load_config
+        self.parallel_config = vllm_config.parallel_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.device_config = vllm_config.device_config
+        self.speculative_config = vllm_config.speculative_config
+        self.prompt_adapter_config = vllm_config.prompt_adapter_config
+        self.observability_config = vllm_config.observability_config
+
     @abstractmethod
     def init_device(self) -> None:
         """Initialize device state, such as loading the model or other on-device
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 739fe1b3d2c4f..f37d70bee76ed 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -10,9 +10,7 @@
 import torch.nn as nn
 
 from vllm.attention import get_attn_backend
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group
 from vllm.inputs import INPUT_REGISTRY, InputRegistry
 from vllm.logger import init_logger
@@ -363,33 +361,18 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
+        vllm_config: VllmConfig,
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         return_hidden_states: bool = False,
-        observability_config: Optional[ObservabilityConfig] = None,
         input_registry: InputRegistry = INPUT_REGISTRY,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ):
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.load_config = load_config
+
+        ModelRunnerBase.__init__(self, vllm_config=vllm_config)
+        model_config = self.model_config
+        cache_config = self.cache_config
         self.is_driver_worker = is_driver_worker
-        self.prompt_adapter_config = prompt_adapter_config
-        self.observability_config = observability_config
-        if self.observability_config is not None:
-            print(f"observability_config is {self.observability_config}")
         self.return_hidden_states = return_hidden_states
 
         self.device = self.device_config.device
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index c1d836bb0d318..1295666055b04 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -8,10 +8,7 @@
 import torch
 import torch.distributed
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig)
+from vllm.config import VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
@@ -19,7 +16,7 @@
 from vllm.platforms import current_platform
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.worker import Worker
-from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
 from vllm.worker.xpu_model_runner import XPUModelRunner
 
 logger = init_logger(__name__)
@@ -36,53 +33,32 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        cache_config: CacheConfig,
-        load_config: LoadConfig,
+        vllm_config: VllmConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
-        lora_config: Optional[LoRAConfig] = None,
-        speculative_config: Optional[SpeculativeConfig] = None,
-        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         is_driver_worker: bool = False,
-        observability_config: Optional[ObservabilityConfig] = None,
     ) -> None:
+        WorkerBase.__init__(self, vllm_config=vllm_config)
+        device_config = self.device_config
+        parallel_config = self.parallel_config
         assert device_config.device_type == "xpu"
         assert current_platform.is_xpu()
 
-        self.model_config = model_config
-        self.parallel_config = parallel_config
         self.parallel_config.rank = rank
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.cache_config = cache_config
-        self.load_config = load_config
+
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
-        self.lora_config = lora_config
-        self.prompt_adapter_config = prompt_adapter_config
         self.is_driver_worker = is_driver_worker
-        self.observability_config = observability_config
         if parallel_config and is_driver_worker:
             assert rank % parallel_config.tensor_parallel_size == 0, \
                    "Driver worker should be rank 0 of tensor parallel group."
 
         self.model_runner = XPUModelRunner(  # type: ignore
-            model_config,
-            parallel_config,
-            scheduler_config,
-            device_config,
-            cache_config,
-            load_config=self.load_config,
-            lora_config=self.lora_config,
+            vllm_config=vllm_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=is_driver_worker,
-            observability_config=self.observability_config,
         )
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.

From d6459b4516dbac4f346ce29fe90d43ebfafa1114 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sat, 2 Nov 2024 10:44:38 -0400
Subject: [PATCH 28/85] [V1] Fix `EngineArgs` refactor on V1 (#9954)

---
 vllm/v1/executor/gpu_executor.py | 39 ++++++++++----------------------
 1 file changed, 12 insertions(+), 27 deletions(-)

diff --git a/vllm/v1/executor/gpu_executor.py b/vllm/v1/executor/gpu_executor.py
index b12c500f1f9ee..de56332240192 100644
--- a/vllm/v1/executor/gpu_executor.py
+++ b/vllm/v1/executor/gpu_executor.py
@@ -1,10 +1,7 @@
 import os
 from typing import Optional, Tuple
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig)
+from vllm.config import EngineConfig
 from vllm.logger import init_logger
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.v1.outputs import ModelRunnerOutput
@@ -15,29 +12,17 @@
 
 class GPUExecutor:
 
-    def __init__(
-        self,
-        model_config: ModelConfig,
-        cache_config: CacheConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
-        speculative_config: Optional[SpeculativeConfig],
-        prompt_adapter_config: Optional[PromptAdapterConfig],
-        observability_config: Optional[ObservabilityConfig],
-    ) -> None:
-        self.model_config = model_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.load_config = load_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.speculative_config = speculative_config
-        self.prompt_adapter_config = prompt_adapter_config
-        self.observability_config = observability_config
+    def __init__(self, vllm_config: EngineConfig) -> None:
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.lora_config = vllm_config.lora_config
+        self.load_config = vllm_config.load_config
+        self.parallel_config = vllm_config.parallel_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.device_config = vllm_config.device_config
+        self.speculative_config = vllm_config.speculative_config
+        self.prompt_adapter_config = vllm_config.prompt_adapter_config
+        self.observability_config = vllm_config.observability_config
 
         self.worker = self._create_worker()
         self.worker.initialize()

From 74b529ceeead8d4b44ded858f7c28bca9c1629ba Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 2 Nov 2024 08:03:33 -0700
Subject: [PATCH 29/85] [bugfix] fix chatglm dummy_data_for_glmv (#9955)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/model_executor/models/chatglm.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index ca90d10e9f9fb..c3c9ec703c1e6 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -14,8 +14,8 @@
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -31,8 +31,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
-                             MultiModalInputs)
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
 from vllm.multimodal.base import MultiModalData
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
@@ -117,16 +116,15 @@ def get_max_glmv_image_tokens(ctx: InputContext):
     raise NotImplementedError(msg)
 
 
-def dummy_data_for_glmv(
-    ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]
-) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
+def dummy_data_for_glmv(ctx: InputContext, seq_len: int,
+                        mm_counts: Mapping[str, int]) -> DummyData:
     hf_config = ctx.get_hf_config(ChatGLMConfig)
     vision_config = getattr(hf_config, 'vision_config', None)
 
     if vision_config is None:
         token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len)
         seq_data = SequenceData(token_ids)
-        return seq_data, None
+        return DummyData(seq_data, None)
     elif isinstance(vision_config, dict):
         image_size = vision_config["image_size"]
         image_placeholder_length = calculate_image_placeholder(vision_config)
@@ -141,7 +139,7 @@ def dummy_data_for_glmv(
             "image": Image.new("RGB", (image_size, image_size), color=0)
         }
 
-        return seq_data, mm_data
+        return DummyData(seq_data, mm_data)
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)

From cea808f32549973cc19204355c950ad005eeed87 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 2 Nov 2024 12:08:49 -0700
Subject: [PATCH 30/85] [3/N] model runner pass the whole config to model
 (#9958)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/lora/conftest.py                       |   9 +-
 vllm/model_executor/model_loader/__init__.py |  20 +--
 vllm/model_executor/model_loader/loader.py   | 132 ++++++++-----------
 vllm/plugins/__init__.py                     |  22 +++-
 vllm/v1/worker/gpu_model_runner.py           |   8 +-
 vllm/worker/cpu_model_runner.py              |   8 +-
 vllm/worker/model_runner.py                  |   8 +-
 vllm/worker/tpu_model_runner.py              |  10 +-
 vllm/worker/xpu_model_runner.py              |  10 +-
 9 files changed, 87 insertions(+), 140 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index e40f0dd74602e..816d3986fe333 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -248,11 +248,10 @@ def llama_2_7b_engine_extra_embeddings():
     cleanup_dist_env_and_memory(shutdown_ray=True)
     get_model_old = get_model
 
-    def get_model_patched(*, model_config, device_config, **kwargs):
-        kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8)
-        return get_model_old(model_config=model_config,
-                             device_config=device_config,
-                             **kwargs)
+    def get_model_patched(**kwargs):
+        kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4,
+                                                       max_lora_rank=8)
+        return get_model_old(**kwargs)
 
     with patch("vllm.worker.model_runner.get_model", get_model_patched):
         engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
index d1ec171c9ec2a..12468997e4653 100644
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -1,27 +1,15 @@
-from typing import Optional
-
 from torch import nn
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig)
+from vllm.config import VllmConfig
 from vllm.model_executor.model_loader.loader import (BaseModelLoader,
                                                      get_model_loader)
 from vllm.model_executor.model_loader.utils import (
     get_architecture_class_name, get_model_architecture)
 
 
-def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
-              device_config: DeviceConfig, parallel_config: ParallelConfig,
-              scheduler_config: SchedulerConfig,
-              lora_config: Optional[LoRAConfig],
-              cache_config: CacheConfig) -> nn.Module:
-    loader = get_model_loader(load_config)
-    return loader.load_model(model_config=model_config,
-                             device_config=device_config,
-                             lora_config=lora_config,
-                             parallel_config=parallel_config,
-                             scheduler_config=scheduler_config,
-                             cache_config=cache_config)
+def get_model(*, vllm_config: VllmConfig) -> nn.Module:
+    loader = get_model_loader(vllm_config.load_config)
+    return loader.load_model(vllm_config=vllm_config)
 
 
 __all__ = [
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 79703bb7ded7a..2cb9e0ca7c505 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -21,9 +21,9 @@
 from transformers import AutoModelForCausalLM, PretrainedConfig
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat,
-                         LoRAConfig, ModelConfig, MultiModalConfig,
-                         ParallelConfig, PoolerConfig, SchedulerConfig)
+from vllm.config import (CacheConfig, LoadConfig, LoadFormat, LoRAConfig,
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         PoolerConfig, SchedulerConfig, VllmConfig)
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.envs import VLLM_USE_MODELSCOPE
@@ -150,6 +150,7 @@ def _get_model_initialization_kwargs(
 
 
 def build_model(model_class: Type[nn.Module],
+                vllm_config: VllmConfig,
                 hf_config: PretrainedConfig,
                 cache_config: Optional[CacheConfig],
                 quant_config: Optional[QuantizationConfig],
@@ -166,23 +167,29 @@ def build_model(model_class: Type[nn.Module],
     if prefix:
         extra_kwargs["prefix"] = prefix
 
+    # TODO: unify all the module initialization code
+    # to only take the `VllmConfig` object as input
+    from vllm.plugins import set_vllm_config
+    set_vllm_config(vllm_config)
+
     return model_class(config=hf_config,
                        cache_config=cache_config,
                        quant_config=quant_config,
                        **extra_kwargs)
 
 
-def _initialize_model(
-        model_config: ModelConfig,
-        load_config: LoadConfig,
-        lora_config: Optional[LoRAConfig],
-        cache_config: CacheConfig,
-        scheduler_config: Optional[SchedulerConfig] = None) -> nn.Module:
+def _initialize_model(vllm_config: VllmConfig) -> nn.Module:
     """Initialize a model with the given configurations."""
+    model_config = vllm_config.model_config
+    lora_config = vllm_config.lora_config
+    scheduler_config = vllm_config.scheduler_config
+    cache_config = vllm_config.cache_config
+    load_config = vllm_config.load_config
     model_class, _ = get_model_architecture(model_config)
 
     return build_model(
         model_class,
+        vllm_config,
         model_config.hf_config,
         cache_config=cache_config,
         quant_config=_get_quantization_config(model_config, load_config),
@@ -205,12 +212,7 @@ def download_model(self, model_config: ModelConfig) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def load_model(self, *, model_config: ModelConfig,
-                   device_config: DeviceConfig,
-                   lora_config: Optional[LoRAConfig],
-                   parallel_config: ParallelConfig,
-                   scheduler_config: SchedulerConfig,
-                   cache_config: CacheConfig) -> nn.Module:
+    def load_model(self, *, vllm_config: VllmConfig) -> nn.Module:
         """Load a model with the given configurations."""
         raise NotImplementedError
 
@@ -396,18 +398,14 @@ def download_model(self, model_config: ModelConfig) -> None:
                               model_config.revision,
                               fall_back_to_pt=True)
 
-    def load_model(self, *, model_config: ModelConfig,
-                   device_config: DeviceConfig,
-                   lora_config: Optional[LoRAConfig],
-                   parallel_config: ParallelConfig,
-                   scheduler_config: SchedulerConfig,
-                   cache_config: CacheConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+        device_config = vllm_config.device_config
+        model_config = vllm_config.model_config
+
         target_device = torch.device(device_config.device)
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
-                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, cache_config,
-                                          scheduler_config)
+                model = _initialize_model(vllm_config=vllm_config)
 
             model.load_weights(self._get_all_weights(model_config, model))
 
@@ -436,17 +434,12 @@ def __init__(self, load_config: LoadConfig):
     def download_model(self, model_config: ModelConfig) -> None:
         pass  # Nothing to download
 
-    def load_model(self, *, model_config: ModelConfig,
-                   device_config: DeviceConfig,
-                   lora_config: Optional[LoRAConfig],
-                   parallel_config: ParallelConfig,
-                   scheduler_config: SchedulerConfig,
-                   cache_config: CacheConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+        device_config = vllm_config.device_config
+        model_config = vllm_config.model_config
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
-                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, cache_config,
-                                          scheduler_config)
+                model = _initialize_model(vllm_config=vllm_config)
             # NOTE(woosuk): For accurate performance evaluation, we assign
             # random values to the weights.
             initialize_dummy_weights(model)
@@ -488,10 +481,7 @@ def _get_weights_iterator(
 
     def _load_model_serialized_cpu(
         self,
-        model_config: ModelConfig,
-        device_config: DeviceConfig,
-        lora_config: Optional[LoRAConfig],
-        cache_config: CacheConfig,
+        vllm_config: VllmConfig,
     ) -> nn.Module:
         """Load a serialized model with tensorizer to the CPU.
 
@@ -500,26 +490,30 @@ def _load_model_serialized_cpu(
         default HuggingFace loading, but will be slower than loading a
         vLLM-tensorized model.
         """
+        device_config = vllm_config.device_config
+        model_config = vllm_config.model_config
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
-                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, cache_config)
+                model = _initialize_model(vllm_config=vllm_config)
 
             model.load_weights(self._get_weights_iterator())
         return model.eval()
 
     def _load_model_serialized(
         self,
-        model_config: ModelConfig,
-        device_config: DeviceConfig,
-        lora_config: Optional[LoRAConfig],
-        cache_config: CacheConfig,
+        vllm_config: VllmConfig,
     ) -> nn.Module:
         """Load a serialized model with tensorizer.
 
         Expects a vLLM-tensorized model. See the
         examples/tensorize_vllm_model.py example script
         for serializing vLLM models."""
+
+        device_config = vllm_config.device_config
+        model_config = vllm_config.model_config
+        lora_config = vllm_config.lora_config
+        cache_config = vllm_config.cache_config
+
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
                 model_class = get_model_architecture(model_config)[0]
@@ -544,12 +538,9 @@ def download_model(self, model_config: ModelConfig) -> None:
         with self.tensorizer_config.open_stream():
             pass
 
-    def load_model(self, *, model_config: ModelConfig,
-                   device_config: DeviceConfig,
-                   lora_config: Optional[LoRAConfig],
-                   parallel_config: ParallelConfig,
-                   scheduler_config: SchedulerConfig,
-                   cache_config: CacheConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+        model_config = vllm_config.model_config
+        parallel_config = vllm_config.parallel_config
         self._verify_config(model_config, parallel_config)
 
         if parallel_config.tensor_parallel_size > 1:
@@ -559,10 +550,8 @@ def load_model(self, *, model_config: ModelConfig,
                     % get_tensor_model_parallel_rank()
 
         if is_vllm_tensorized(self.tensorizer_config):
-            return self._load_model_serialized(model_config, device_config,
-                                               lora_config, cache_config)
-        return self._load_model_serialized_cpu(model_config, device_config,
-                                               lora_config, cache_config)
+            return self._load_model_serialized(vllm_config=vllm_config)
+        return self._load_model_serialized_cpu(vllm_config=vllm_config)
 
     @staticmethod
     def save_model(
@@ -648,12 +637,9 @@ def _prepare_weights(self, model_name_or_path: str,
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_model(self, *, model_config: ModelConfig,
-                   device_config: DeviceConfig,
-                   lora_config: Optional[LoRAConfig],
-                   parallel_config: ParallelConfig,
-                   scheduler_config: SchedulerConfig,
-                   cache_config: CacheConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+        device_config = vllm_config.device_config
+        model_config = vllm_config.model_config
         from safetensors.torch import safe_open
 
         from vllm.distributed import get_tensor_model_parallel_rank
@@ -663,8 +649,7 @@ def load_model(self, *, model_config: ModelConfig,
 
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
-                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, cache_config)
+                model = _initialize_model(vllm_config=vllm_config)
                 for _, module in model.named_modules():
                     quant_method = getattr(module, "quant_method", None)
                     if quant_method is not None:
@@ -1157,16 +1142,12 @@ def _load_weights(self, model_config: ModelConfig,
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_model(self, *, model_config: ModelConfig,
-                   device_config: DeviceConfig,
-                   lora_config: Optional[LoRAConfig],
-                   parallel_config: ParallelConfig,
-                   scheduler_config: SchedulerConfig,
-                   cache_config: CacheConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+        device_config = vllm_config.device_config
+        model_config = vllm_config.model_config
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
-                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, cache_config)
+                model = _initialize_model(vllm_config=vllm_config)
 
                 self._load_weights(model_config, model)
 
@@ -1235,13 +1216,9 @@ def _get_weights_iterator(
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model)
 
-    def load_model(self, *, model_config: ModelConfig,
-                   device_config: DeviceConfig,
-                   lora_config: Optional[LoRAConfig],
-                   parallel_config: ParallelConfig,
-                   scheduler_config: SchedulerConfig,
-                   cache_config: CacheConfig) -> nn.Module:
-
+    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+        device_config = vllm_config.device_config
+        model_config = vllm_config.model_config
         local_model_path = self._prepare_weights(model_config.model)
         gguf_weights_map = self._get_gguf_weights_map(model_config)
         # we can only know if tie word embeddings after mapping weights
@@ -1251,8 +1228,7 @@ def load_model(self, *, model_config: ModelConfig,
 
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
-                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, cache_config)
+                model = _initialize_model(vllm_config=vllm_config)
             model.load_weights(
                 self._get_weights_iterator(local_model_path, gguf_weights_map))
         return model
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 4338cbc37f6c1..3336569f59467 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,8 +1,14 @@
 import logging
-from typing import Callable, Optional, Union
+from typing import TYPE_CHECKING, Callable, Optional, Union
 
 import vllm.envs as envs
-from vllm.compilation.config import CompilationConfig
+
+if TYPE_CHECKING:
+    from vllm.compilation.config import CompilationConfig
+    from vllm.config import VllmConfig
+else:
+    CompilationConfig = None
+    VllmConfig = None
 
 logger = logging.getLogger(__name__)
 
@@ -55,3 +61,15 @@ def set_compilation_config(config: Optional[CompilationConfig]):
 
 def get_compilation_config() -> Optional[CompilationConfig]:
     return _compilation_config
+
+
+_vllm_config: Optional[VllmConfig] = None
+
+
+def set_vllm_config(config: Optional[VllmConfig]):
+    global _vllm_config
+    _vllm_config = config
+
+
+def get_vllm_config() -> Optional[VllmConfig]:
+    return _vllm_config
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 77c1e10ab6bdf..2510ea3700d0b 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -369,13 +369,7 @@ def load_model(self) -> None:
         logger.info("Starting to load model %s...", self.model_config.model)
         with DeviceMemoryProfiler() as m:  # noqa: SIM117
             with patch("vllm.model_executor.layers.sampler.Sampler", Sampler):
-                self.model = get_model(model_config=self.model_config,
-                                       device_config=self.device_config,
-                                       load_config=self.load_config,
-                                       lora_config=self.lora_config,
-                                       parallel_config=self.parallel_config,
-                                       scheduler_config=self.scheduler_config,
-                                       cache_config=self.cache_config)
+                self.model = get_model(vllm_config=self.vllm_config)
 
         self.model_memory_usage = m.consumed_memory
         logger.info("Loading model weights took %.4f GB",
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index a98faa2f2d0cb..fdd72a452f2ad 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -453,13 +453,7 @@ def model_is_mrope(self) -> bool:
         return uses_mrope(self.model_config.hf_config)
 
     def load_model(self) -> None:
-        self.model = get_model(model_config=self.model_config,
-                               load_config=self.load_config,
-                               device_config=self.device_config,
-                               lora_config=self.lora_config,
-                               parallel_config=self.parallel_config,
-                               scheduler_config=self.scheduler_config,
-                               cache_config=self.cache_config)
+        self.model = get_model(vllm_config=self.vllm_config)
 
     def make_model_input_from_broadcasted_tensor_dict(
         self,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 0e200e6abb05e..328dab598f8ef 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1051,13 +1051,7 @@ def __init__(
     def load_model(self) -> None:
         logger.info("Starting to load model %s...", self.model_config.model)
         with DeviceMemoryProfiler() as m:
-            self.model = get_model(model_config=self.model_config,
-                                   device_config=self.device_config,
-                                   load_config=self.load_config,
-                                   lora_config=self.lora_config,
-                                   parallel_config=self.parallel_config,
-                                   scheduler_config=self.scheduler_config,
-                                   cache_config=self.cache_config)
+            self.model = get_model(vllm_config=self.vllm_config)
 
         self.model_memory_usage = m.consumed_memory
         logger.info("Loading model weights took %.4f GB",
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 7d9d669a45ce3..a721186137328 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -137,15 +137,7 @@ def load_model(self) -> None:
                 "vllm.model_executor.layers.vocab_parallel_embedding."
                 "get_tensor_model_parallel_rank",
                 return_value=xm_tp_rank):
-            model = get_model(
-                model_config=self.model_config,
-                load_config=self.load_config,
-                device_config=self.device_config,
-                parallel_config=self.parallel_config,
-                cache_config=self.cache_config,
-                scheduler_config=self.scheduler_config,
-                lora_config=None,
-            )
+            model = get_model(vllm_config=self.vllm_config)
         model = model.eval()
         xm.wait_device_ops()
         self.model = ModelWrapper(model)
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index f37d70bee76ed..bae8b469767b2 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -405,15 +405,7 @@ def __init__(
 
     def load_model(self) -> None:
         with DeviceMemoryProfiler() as m:
-            self.model = get_model(
-                model_config=self.model_config,
-                device_config=self.device_config,
-                load_config=self.load_config,
-                lora_config=self.lora_config,
-                parallel_config=self.parallel_config,
-                scheduler_config=self.scheduler_config,
-                cache_config=self.cache_config,
-            )
+            self.model = get_model(vllm_config=self.vllm_config)
 
         self.model_memory_usage = m.consumed_memory
         logger.info("Loading model weights took %.4f GB",

From 1b73ab2a1f0761a60b28aabe0456a5735de027c5 Mon Sep 17 00:00:00 2001
From: Nikita Furin <nokados@yandex.ru>
Date: Sat, 2 Nov 2024 22:50:28 +0300
Subject: [PATCH 31/85] [CI/Build] Quoting around > (#9956)

---
 Dockerfile         | 2 +-
 Dockerfile.neuron  | 2 +-
 Dockerfile.ppc64le | 2 +-
 Dockerfile.rocm    | 2 +-
 Dockerfile.tpu     | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 0a562253c537b..343364da2ebf5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -206,7 +206,7 @@ FROM vllm-base AS vllm-openai
 
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10
+    pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10
 
 ENV VLLM_USAGE_SOURCE production-docker-image
 
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 0d0d8df94578c..2143315d2a078 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -31,7 +31,7 @@ RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
 RUN python3 -m pip install -U \
-        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
         -r requirements-neuron.txt
 
 ENV VLLM_TARGET_DEVICE neuron
diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index cd5fcf481f07c..b19c6ddec7948 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 # These packages will be in rocketce eventually
 RUN --mount=type=cache,target=/root/.cache/pip  \
     pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
-        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
         torch==2.3.1 \
         -r requirements-cpu.txt \
         xformers uvloop==0.20.0
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 562117a313020..8fb79afaebe97 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -52,7 +52,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
             python3 -m pip uninstall -y torch torchvision \
             && python3 -m pip install --pre \
                 torch==2.6.0.dev20240918 \
-                setuptools-scm>=8 \
+                'setuptools-scm>=8' \
                 torchvision==0.20.0.dev20240918 \
                 --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
         *) ;; esac
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index dd8f9ad4714a9..b43442e4c0af1 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -25,7 +25,7 @@ ENV VLLM_TARGET_DEVICE="tpu"
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     python3 -m pip install \
-        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
         -r requirements-tpu.txt
 RUN python3 setup.py develop
 

From ae5279a16385e15c07ab2bcadcbcab44367595e9 Mon Sep 17 00:00:00 2001
From: Yongzao <532741407@qq.com>
Date: Sun, 3 Nov 2024 03:56:05 +0800
Subject: [PATCH 32/85] [torch.compile] Adding torch compile to vision-language
 models (#9946)

---
 vllm/model_executor/models/llava_next.py | 10 +++++++---
 vllm/model_executor/models/minicpmv.py   |  7 ++++++-
 vllm/model_executor/models/molmo.py      | 12 ++++++++----
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index e8c5786066170..7a2c95594ddcd 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -606,7 +606,6 @@ def forward(
             :class:`LlavaNextImageInputs`
         """
         if intermediate_tensors is not None:
-            input_ids = None
             inputs_embeds = None
         else:
             image_input = self._parse_and_validate_image_input(**kwargs)
@@ -618,9 +617,14 @@ def forward(
                     self.language_model.model.get_input_embeddings,
                     lambda _: self._process_image_input(image_input),
                 )
-                input_ids = None
             else:
-                inputs_embeds = None
+                inputs_embeds = self.language_model.model.get_input_embeddings(
+                    input_ids)
+
+        # always pass the input via `inputs_embeds`
+        # to make sure the computation graph is consistent
+        # for `torch.compile` integration
+        input_ids = None
 
         hidden_states = self.language_model.model(input_ids,
                                                   positions,
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index a526a5dccd398..e7088edb97b2b 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -564,8 +564,13 @@ def forward(
 
             vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs)
 
+        # always pass the input via `inputs_embeds`
+        # to make sure the computation graph is consistent
+        # for `torch.compile` integration
+        input_ids = None
+
         output = self.llm(
-            input_ids=None,
+            input_ids=input_ids,
             positions=positions,
             kv_caches=kv_caches,
             attn_metadata=attn_metadata,
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 3c34227767e05..ba798833e26a9 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -15,6 +15,7 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.attention.selector import _Backend
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, MultiModalConfig
 from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
@@ -713,6 +714,7 @@ def forward(
         return image_features
 
 
+@support_torch_compile
 class MolmoModel(nn.Module):
 
     def __init__(
@@ -1141,7 +1143,6 @@ def forward(
         **kwargs: object,
     ) -> SamplerOutput:
         if intermediate_tensors is not None:
-            input_ids = None
             inputs_embeds = None
         else:
             image_input = self._parse_and_validate_image_input(**kwargs)
@@ -1156,10 +1157,13 @@ def forward(
                     image_input["image_input_idx"],
                     image_input["seq_len"],
                 )
-
-                input_ids = None
             else:
-                inputs_embeds = None
+                inputs_embeds = self.model.embed_tokens(input_ids)
+
+        # always pass the input via `inputs_embeds`
+        # to make sure the computation graph is consistent
+        # for `torch.compile` integration
+        input_ids = None
 
         hidden_states = self.model(
             input_ids=input_ids,

From 3bb4befea7166850bdee3f72fe060c9c4044ba85 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 2 Nov 2024 15:54:05 -0700
Subject: [PATCH 33/85] [bugfix] fix tests (#9959)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/model_executor/model_loader/loader.py | 2 +-
 vllm/model_executor/models/utils.py        | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 2cb9e0ca7c505..2cf4e92908353 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -150,7 +150,7 @@ def _get_model_initialization_kwargs(
 
 
 def build_model(model_class: Type[nn.Module],
-                vllm_config: VllmConfig,
+                vllm_config: Optional[VllmConfig],
                 hf_config: PretrainedConfig,
                 cache_config: Optional[CacheConfig],
                 quant_config: Optional[QuantizationConfig],
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index c6ec1769fc5d1..fee97e8922a76 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -252,6 +252,7 @@ def init_vllm_registered_model(
 
     return build_model(
         model_class,
+        None,
         hf_config,
         cache_config,
         quant_config,

From 1f1b6d6eda3ea5fbdf4566632ac8a9fa61b31593 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Sun, 3 Nov 2024 17:14:17 +0000
Subject: [PATCH 34/85] [V1] Support per-request seed (#9945)

Signed-off-by: Nick Hill <nickhill@us.ibm.com>
---
 vllm/v1/sample/metadata.py         |  5 +--
 vllm/v1/sample/sampler.py          | 23 +++++------
 vllm/v1/worker/gpu_model_runner.py | 61 ++++++++++++++----------------
 3 files changed, 41 insertions(+), 48 deletions(-)

diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py
index 28614377b27b9..9ef36f2e6b212 100644
--- a/vllm/v1/sample/metadata.py
+++ b/vllm/v1/sample/metadata.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import Dict
 
 import torch
 
@@ -16,7 +16,6 @@ class SamplingMetadata:
     no_top_p: bool
     no_top_k: bool
 
-    generators: List[Optional[torch.Generator]]
-    no_generator: bool
+    generators: Dict[int, torch.Generator]
 
     max_num_logprobs: int
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 157c4dd6d771e..927f274541c4d 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -1,5 +1,5 @@
 """A layer that samples the next tokens from the model's outputs."""
-from typing import List, Optional
+from typing import Dict
 
 import torch
 import torch.nn as nn
@@ -84,22 +84,21 @@ def greedy_sample(self, probs: torch.Tensor) -> torch.Tensor:
     def random_sample(
         self,
         probs: torch.Tensor,
-        generators: List[Optional[torch.Generator]],
-        no_generator: bool,
+        generators: Dict[int, torch.Generator],
     ) -> torch.Tensor:
         q = torch.empty_like(probs)
         # NOTE(woosuk): To batch-process the requests without their own seeds,
         # which is the common case, we first assume that every request does
         # not have its own seed. Then, we overwrite the values for the requests
         # that have their own seeds.
-        q.exponential_()
-        if not no_generator:
-            assert len(generators) == probs.shape[0]
+        if len(generators) != probs.shape[0]:
+            # This might still be done here unnecessarily if there are greedies
+            q.exponential_()
+        if generators:
             # TODO(woosuk): This can be slow because we handle each request
             # one by one. Optimize this.
-            for i, generator in enumerate(generators):
-                if generator is not None:
-                    q[i].exponential_(generator=generator)
+            for i, generator in generators.items():
+                q[i].exponential_(generator=generator)
         return probs.div_(q).argmax(dim=-1).view(-1)
 
     def sample(
@@ -112,13 +111,11 @@ def sample(
         if sampling_metadata.all_greedy:
             return self.greedy_sample(probs)
         if sampling_metadata.all_random:
-            return self.random_sample(probs, sampling_metadata.generators,
-                                      sampling_metadata.no_generator)
+            return self.random_sample(probs, sampling_metadata.generators)
 
         greedy_sampled = self.greedy_sample(probs)
         random_sampled = self.random_sample(probs,
-                                            sampling_metadata.generators,
-                                            sampling_metadata.no_generator)
+                                            sampling_metadata.generators)
         sampled = torch.where(
             sampling_metadata.temperature < _SAMPLING_EPS,
             greedy_sampled,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 2510ea3700d0b..ae4239f8e1fab 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -128,13 +128,20 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Add new requests to the cached states.
         for req_data in scheduler_output.scheduled_new_reqs:
             req_id = req_data.req_id
+            sampling_params = req_data.sampling_params
+            if sampling_params.seed is not None:
+                generator = torch.Generator(device=self.device)
+                generator.manual_seed(sampling_params.seed)
+            else:
+                generator = None
+
             self.requests[req_id] = CachedRequestState(
                 req_id=req_id,
                 prompt_token_ids=req_data.prompt_token_ids,
                 prompt=req_data.prompt,
                 multi_modal_data=req_data.multi_modal_data,
-                sampling_params=req_data.sampling_params,
-                generator=None,  # TODO
+                sampling_params=sampling_params,
+                generator=generator,
                 block_ids=req_data.block_ids,
                 num_computed_tokens=req_data.num_computed_tokens,
                 output_token_ids=[],
@@ -342,11 +349,9 @@ def execute_model(
             else:
                 # Ignore the sampled token from the partial request.
                 # Rewind the generator state as if the token was not sampled.
-                generator = self.input_batch.generators[i]
+                generator = self.input_batch.generators.get(i)
                 if generator is not None:
-                    offset = generator.get_offset()
-                    generator = generator.set_offset(offset - 1)
-                    self.input_batch.generators[i] = generator
+                    generator.set_offset(generator.get_offset() - 1)
 
         if sampler_output.logprob_token_ids is None:
             logprob_token_ids = None
@@ -494,8 +499,8 @@ def __init__(
         self.top_k_cpu = self.top_k_cpu_tensor.numpy()
         self.top_k_reqs: Set[str] = set()
 
-        self.generators: List[Optional[torch.Generator]] = [None
-                                                            ] * max_num_reqs
+        # req_index -> generator
+        self.generators: Dict[int, torch.Generator] = {}
 
         self.num_logprobs: Dict[str, int] = {}
         self.prompt_logprob_reqs: Set[str] = set()
@@ -509,8 +514,9 @@ def add_request(
             req_index = self.num_reqs
         assert req_index < self.max_num_reqs
 
-        self.req_ids[req_index] = request.req_id
-        self.req_id_to_index[request.req_id] = req_index
+        req_id = request.req_id
+        self.req_ids[req_index] = req_id
+        self.req_id_to_index[req_id] = req_index
 
         # Copy the prompt token ids and output token ids.
         num_prompt_tokens = len(request.prompt_token_ids)
@@ -528,27 +534,24 @@ def add_request(
         sampling_params = request.sampling_params
         self.temperature_cpu[req_index] = sampling_params.temperature
         if sampling_params.sampling_type == SamplingType.GREEDY:
-            self.greedy_reqs.add(req_index)
-        elif sampling_params.sampling_type == SamplingType.RANDOM:
-            self.random_reqs.add(req_index)
-        elif sampling_params.sampling_type == SamplingType.RANDOM_SEED:
-            # TODO(woosuk): Support per-request random seed.
-            raise NotImplementedError("Per-request seed is not supported yet.")
+            self.greedy_reqs.add(req_id)
+        else:
+            self.random_reqs.add(req_id)
 
         self.top_p_cpu[req_index] = sampling_params.top_p
         if sampling_params.top_p < 1:
-            self.top_p_reqs.add(req_index)
+            self.top_p_reqs.add(req_id)
         self.top_k_cpu[req_index] = sampling_params.top_k
         if sampling_params.top_k > 0:
-            self.top_k_reqs.add(req_index)
+            self.top_k_reqs.add(req_id)
 
         self.generators[req_index] = request.generator
 
         num_logprobs = sampling_params.logprobs
         if num_logprobs is not None and num_logprobs > 0:
-            self.num_logprobs[request.req_id] = num_logprobs
+            self.num_logprobs[req_id] = num_logprobs
         if sampling_params.prompt_logprobs:
-            self.prompt_logprob_reqs.add(req_index)
+            self.prompt_logprob_reqs.add(req_id)
 
     def remove_request(self, req_id: str) -> Optional[int]:
         req_index = self.req_id_to_index.pop(req_id, None)
@@ -560,7 +563,7 @@ def remove_request(self, req_id: str) -> Optional[int]:
         self.random_reqs.discard(req_id)
         self.top_p_reqs.discard(req_id)
         self.top_k_reqs.discard(req_id)
-        self.generators[req_index] = None
+        self.generators.pop(req_index, None)
         self.num_logprobs.pop(req_id, None)
         self.prompt_logprob_reqs.discard(req_id)
         return req_index
@@ -612,7 +615,9 @@ def condense(self, empty_req_indices: List[int]) -> None:
                 last_req_index]
             self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
             self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
-            self.generators[empty_index] = self.generators[last_req_index]
+            generator = self.generators.pop(last_req_index, None)
+            if generator is not None:
+                self.generators[empty_index] = generator
 
             # Decrement last_req_index since it is now empty.
             last_req_index -= 1
@@ -636,8 +641,7 @@ def make_sampling_metadata(
             top_k=self.top_k[:self.num_reqs],
             no_top_p=self.no_top_p,
             no_top_k=self.no_top_k,
-            generators=self.generators[:self.num_reqs],
-            no_generator=self.no_generator,
+            generators=self.generators,
             max_num_logprobs=self.max_num_logprobs,
         )
 
@@ -661,16 +665,9 @@ def no_top_p(self) -> bool:
     def no_top_k(self) -> bool:
         return len(self.top_k_reqs) == 0
 
-    @property
-    def no_generator(self) -> bool:
-        return len(self.generators) == 0
-
     @property
     def max_num_logprobs(self) -> int:
-        if self.num_logprobs:
-            return max(self.num_logprobs.values())
-        else:
-            return 0
+        return max(self.num_logprobs.values()) if self.num_logprobs else 0
 
     @property
     def no_logprob(self) -> bool:

From 54597724f4c6b52d50152f3cc46e86c101d9c820 Mon Sep 17 00:00:00 2001
From: shanshan wang <cooleel@gmail.com>
Date: Sun, 3 Nov 2024 18:15:36 -0600
Subject: [PATCH 35/85] [Model] Add support for H2OVL-Mississippi models
 (#9747)

Signed-off-by: Shanshan Wang <shanshan.wang@h2o.ai>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.rst       |   6 +
 examples/offline_inference_vision_language.py |  28 +-
 ...e_inference_vision_language_multi_image.py |  35 ++
 .../vision_language/test_h2ovl.py             | 130 ++++++
 .../vision_language/test_models.py            |  17 +
 .../vision_language/vlm_utils/model_utils.py  |  60 +++
 vllm/entrypoints/chat_utils.py                |   3 +-
 vllm/model_executor/models/h2ovl.py           | 401 ++++++++++++++++++
 vllm/model_executor/models/registry.py        |   3 +-
 vllm/transformers_utils/config.py             |   2 +
 vllm/transformers_utils/configs/__init__.py   |   4 +-
 vllm/transformers_utils/configs/h2ovl.py      |  13 +
 12 files changed, 698 insertions(+), 4 deletions(-)
 create mode 100644 tests/models/decoder_only/vision_language/test_h2ovl.py
 create mode 100644 vllm/model_executor/models/h2ovl.py
 create mode 100644 vllm/transformers_utils/configs/h2ovl.py

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index a5c085bb84db9..55835d945b00c 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -440,6 +440,12 @@ Text Generation
     - :code:`THUDM/glm-4v-9b` etc.
     - 
     - ✅︎
+  * - :code:`H2OVLChatModel`
+    - H2OVL
+    - T + I\ :sup:`E+`
+    - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc.
+    - 
+    - ✅︎
   * - :code:`InternVLChatModel`
     - InternVL2
     - T + I\ :sup:`E+`
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 60cdb186331fe..4fd002caf1763 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -176,6 +176,31 @@ def run_minicpmv(question: str, modality: str):
     return llm, prompt, stop_token_ids
 
 
+# H2OVL-Mississippi
+def run_h2ovl(question: str, modality: str):
+    assert modality == "image"
+
+    model_name = "h2oai/h2ovl-mississippi-2b"
+
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    # Stop tokens for H2OVL-Mississippi
+    # https://huggingface.co/h2oai/h2ovl-mississippi-2b
+    stop_token_ids = [tokenizer.eos_token_id]
+    return llm, prompt, stop_token_ids
+
+
 # InternVL
 def run_internvl(question: str, modality: str):
     assert modality == "image"
@@ -363,6 +388,7 @@ def run_glm4v(question: str, modality: str):
     "chameleon": run_chameleon,
     "minicpmv": run_minicpmv,
     "blip-2": run_blip2,
+    "h2ovl_chat": run_h2ovl,
     "internvl_chat": run_internvl,
     "NVLM_D": run_nvlm_d,
     "qwen_vl": run_qwen_vl,
@@ -475,4 +501,4 @@ def main(args):
                         default=16,
                         help='Number of frames to extract from the video.')
     args = parser.parse_args()
-    main(args)
+    main(args)
\ No newline at end of file
diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py
index e28514bf403f7..d99684078ff3d 100644
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
@@ -107,6 +107,40 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
     )
 
 
+def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
+    model_name = "h2oai/h2ovl-mississippi-2b"
+
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"max_dynamic_patch": 4},
+    )
+
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    # Stop tokens for H2OVL-Mississippi
+    # https://huggingface.co/h2oai/h2ovl-mississippi-2b
+    stop_token_ids = [tokenizer.eos_token_id]
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+
+
 def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
     model_name = "OpenGVLab/InternVL2-2B"
 
@@ -258,6 +292,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
 
 model_example_map = {
     "phi3_v": load_phi3v,
+    "h2ovl_chat": load_h2onvl,
     "internvl_chat": load_internvl,
     "NVLM_D": load_nvlm_d,
     "qwen2_vl": load_qwen2_vl,
diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py
new file mode 100644
index 0000000000000..ad9aa3104750b
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -0,0 +1,130 @@
+from typing import Optional, Tuple
+
+import pytest
+import torch
+from PIL.Image import Image
+from transformers import AutoConfig
+
+# Import the functions to test
+from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
+                                              image_to_pixel_values_wrapper)
+from vllm.multimodal.utils import rescale_image_size
+
+models = [
+    "h2oai/h2ovl-mississippi-800m",
+    "h2oai/h2ovl-mississippi-2b",
+]
+target_dtype = "bfloat16"
+
+
+def run_preprocessing_test(
+    image: Image,
+    config,
+    max_dynamic_patch: Optional[int] = None,
+) -> Tuple[torch.Tensor, int]:
+    """Test the image preprocessing and calculate expected blocks."""
+
+    if max_dynamic_patch is None:
+        max_dynamic_patch = config.max_dynamic_patch
+
+    width, height = image.size
+    use_MSAC = config.use_msac
+
+    # Create the mapper function with the provided configuration
+    mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC)
+    pixel_values = mapper(image)
+
+    # Calculate the expected number of blocks
+    if use_MSAC:
+        # First pass
+        blocks1, _, _, aspect_ratio = calculate_num_blocks(
+            width,
+            height,
+            config.min_dynamic_patch,
+            max_dynamic_patch,
+            config.vision_config.image_size,
+            use_thumbnail=False,  # Thumbnail is handled separately
+            prior_aspect_ratio=None,
+        )
+
+        # Second pass
+        blocks2, _, _, _ = calculate_num_blocks(
+            width,
+            height,
+            config.min_dynamic_patch,
+            max_dynamic_patch,
+            config.vision_config.image_size,
+            use_thumbnail=False,
+            prior_aspect_ratio=aspect_ratio,
+        )
+
+        # Add thumbnail if use_thumbnail is True and total_blocks > 1
+        if config.use_thumbnail:
+            blocks1 += 1 if blocks1 > 1 else 0
+            blocks2 += 1 if blocks2 > 1 else 0
+
+        # Total blocks is the sum of blocks from both passes minus overlapping
+        total_blocks = blocks1 + blocks2 - 1
+
+        expected_blocks = total_blocks
+
+    else:
+        blocks, _, _, _ = calculate_num_blocks(
+            width,
+            height,
+            config.min_dynamic_patch,
+            max_dynamic_patch,
+            config.vision_config.image_size,
+            use_thumbnail=False,
+            prior_aspect_ratio=None,
+        )
+        expected_blocks = blocks
+
+        if config.use_thumbnail and expected_blocks > 1:
+            expected_blocks += 1
+
+    return pixel_values, expected_blocks
+
+
+@pytest.mark.parametrize("model_name", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8])
+def test_image_preprocessing(image_assets, model_name, size_factors,
+                             max_dynamic_patch):
+    """Test image preprocessing pipeline with different configurations."""
+    # Load the configuration from the model
+    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+    for asset in image_assets:
+        image = asset.pil_image
+        for factor in size_factors:
+            scaled_image = rescale_image_size(image, factor)
+
+            # Test preprocessing and get expected number of blocks
+            pixel_values, expected_blocks = run_preprocessing_test(
+                scaled_image, config, max_dynamic_patch)
+
+            # Verify output shapes and properties
+            actual_blocks = pixel_values.shape[0]
+            assert actual_blocks == expected_blocks, (
+                f"Expected {expected_blocks} blocks, got {actual_blocks}")
+
+            # Check image dimensions
+            expected_size = (
+                3,  # Number of channels (C, H, W)
+                config.vision_config.image_size,
+                config.vision_config.image_size,
+            )
+            for img in pixel_values:
+                assert img.shape == expected_size, (
+                    f"Expected image size {expected_size}, got {img.shape}")
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index e49ea6f98324d..cfd2d61f2b633 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -187,6 +187,23 @@
         marks=[large_gpu_mark(min_gb=48)],
         patch_hf_runner=model_utils.glm_patch_hf_runner,
     ),
+    "h2ovl": VLMTestInfo(
+        models = [
+            "h2oai/h2ovl-mississippi-800m",
+            "h2oai/h2ovl-mississippi-2b",
+        ],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<image>\nWhat is the season?",
+        }),
+        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
+        max_model_len=8192,
+        dtype="bfloat16",
+        use_tokenizer_eos=True,
+        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
+    ),
     "intern_vl": VLMTestInfo(
         models=[
             "OpenGVLab/InternVL2-1B",
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index e925934db0e7c..849857b4232e7 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -259,6 +259,66 @@ def processor(*args, text="", images=None, **kwargs):
     return hf_model
 
 
+def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for H2OVL."""
+
+    class H2OVLProcessor:
+        """A simple processor for H2OVL models."""
+
+        def __init__(self, hf_runner: HfRunner):
+            self.num_image_token = hf_runner.model.num_image_token
+            self.tokenizer = hf_runner.tokenizer
+            self.dtype = hf_runner.model.dtype
+
+            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
+                                                     trust_remote_code=True)
+            self.vision_config = self.config.vision_config
+            self.use_thumbnail = self.config.use_thumbnail
+            self.min_num = self.config.min_dynamic_patch
+            self.max_num = self.config.max_dynamic_patch
+            self.image_size = self.vision_config.image_size
+
+        def __call__(self, text: str, images: Union[Image, List[Image]],
+                     **kwargs):
+            # yapf: disable
+            from vllm.model_executor.models.h2ovl import (
+                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+
+            # yapf: enable
+            images = [images] if isinstance(images, Image) else images
+            pixel_values = [
+                image_to_pixel_values(image,
+                                      self.image_size,
+                                      self.min_num,
+                                      self.max_num,
+                                      self.use_thumbnail,
+                                      use_MSAC=self.config.use_msac).to(
+                                          self.dtype) for image in images
+            ]
+            num_patches_list = [
+                pixel_value.shape[0] for pixel_value in pixel_values
+            ]
+            pixel_values = torch.cat(pixel_values, dim=0)
+            for num_patches in num_patches_list:
+                context_tokens = IMG_CONTEXT * self.num_image_token \
+                    * num_patches
+                image_tokens = IMG_START + context_tokens + IMG_END
+                text = text.replace('<image>', image_tokens, 1)
+            prompt = self.tokenizer(text, return_tensors="pt")
+            prompt.update({"pixel_values": pixel_values})
+            return prompt
+
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
+        "<IMG_CONTEXT>")
+    hf_model.model.img_context_token_id = img_context_token_id
+    hf_model.processor = H2OVLProcessor(hf_model)
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.language_model.get_output_embeddings()
+    hf_model.model.generate = types.MethodType(_internvl_generate,
+                                               hf_model.model)
+    return hf_model
+
+
 def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for InternVL."""
 
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index bc2de2d162473..c9552977710d1 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -187,7 +187,8 @@ def _placeholder_str(self, modality: ModalityStr,
             if model_type.startswith("llava"):
                 return self._cached_token_str(self._tokenizer,
                                               hf_config.image_token_index)
-            if model_type in ("chameleon", "internvl_chat", "NVLM_D"):
+            if model_type in ("chameleon", "internvl_chat", "NVLM_D",
+                              "h2ovl_chat"):
                 return "<image>"
             if model_type == "mllama":
                 return "<|image|>"
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
new file mode 100644
index 0000000000000..43242fe370ba2
--- /dev/null
+++ b/vllm/model_executor/models/h2ovl.py
@@ -0,0 +1,401 @@
+# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
+# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
+# --------------------------------------------------------
+# H2OVL-Mississippi
+# Copyright (c) 2024 H2O.AI
+# Licensed under Apache 2.0 License [see LICENSE for details]
+# --------------------------------------------------------
+from functools import partial
+from typing import List, Optional, Tuple
+
+import torch
+from PIL import Image
+from transformers import PretrainedConfig
+
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
+                         token_inputs)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.utils import is_list_of
+
+from .intern_vit import InternVisionModel
+from .internvl import (IMG_CONTEXT, IMG_END, IMG_START, InternVLChatModel,
+                       InternVLInputPipeline, build_transform,
+                       find_closest_aspect_ratio, get_internvl_num_patches)
+
+
+# modified to include blocks generated in second pass
+def calculate_num_blocks(
+    orig_width: int,
+    orig_height: int,
+    min_num: int,
+    max_num: int,
+    image_size: int,
+    use_thumbnail: bool,
+    prior_aspect_ratio=None,
+) -> Tuple[int, int, int, Tuple[int, int]]:
+    aspect_ratio = orig_width / orig_height
+
+    # enumerate (i, j) tile grids with min_num..max_num tiles, fewest first
+    target_ratios = set((i, j) for n in range(min_num, max_num + 1)
+                        for i in range(1, n + 1) for j in range(1, n + 1)
+                        if i * j <= max_num and i * j >= min_num)
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # given a prior grid, keep grids where neither side divides the prior's
+    if prior_aspect_ratio is not None:
+        target_ratios = [
+            ratio for ratio in target_ratios if prior_aspect_ratio[0] %
+            ratio[0] != 0 and prior_aspect_ratio[1] % ratio[1] != 0
+        ]
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio,
+                                                    target_ratios, orig_width,
+                                                    orig_height, image_size)
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+    # add thumbnail image if num_blocks > 1
+    if use_thumbnail and blocks > 1:
+        blocks += 1
+    return blocks, target_width, target_height, target_aspect_ratio
+
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+# refactored to handle prior_aspect_ratio as optional
+def dynamic_preprocess(
+    image: Image.Image,
+    min_num: int,
+    max_num: int,
+    image_size: int,
+    use_thumbnail: bool,
+    prior_aspect_ratio: Optional[Tuple[int, int]] = None,
+) -> Tuple[List[Image.Image], Tuple[int, int]]:
+    orig_width, orig_height = image.size
+
+    # calculate the number of blocks based on prior aspect ratio if available
+    blocks, target_width, target_height, target_aspect_ratio = (
+        calculate_num_blocks(
+            orig_width,
+            orig_height,
+            min_num,
+            max_num,
+            image_size,
+            use_thumbnail=False,
+            prior_aspect_ratio=prior_aspect_ratio,
+        ))
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images, target_aspect_ratio
+
+
+def load_image(
+    image: Image.Image,
+    input_size=448,
+    min_num=1,
+    max_num=6,
+    use_thumbnail=True,
+    prior_aspect_ratio: Optional[Tuple[int, int]] = None,
+) -> Tuple[torch.Tensor, Tuple[int, int]]:
+    transform = build_transform(input_size=input_size)
+    images, target_aspect_ratio = dynamic_preprocess(
+        image,
+        image_size=input_size,
+        use_thumbnail=use_thumbnail,
+        min_num=min_num,
+        max_num=max_num,
+        prior_aspect_ratio=prior_aspect_ratio,
+    )
+    pixel_values = [transform(image) for image in images]
+    pixel_values = torch.stack(pixel_values)
+    return pixel_values, target_aspect_ratio
+
+
+# refactored to use the combined load_image function
+def image_to_pixel_values(
+    image: Image.Image,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+    use_MSAC: bool,
+) -> torch.Tensor:
+    # when MSAC is turned on, we need to process the image twice
+    if use_MSAC:
+        # first pass
+        pixel_values, target_aspect_ratio = load_image(
+            image,
+            input_size=input_size,
+            min_num=min_num,
+            max_num=max_num,
+            use_thumbnail=True,
+        )
+        # second pass
+        pixel_values2, _ = load_image(
+            image,
+            input_size=input_size,
+            min_num=min_num,
+            max_num=max_num,
+            prior_aspect_ratio=target_aspect_ratio,
+        )
+        # combine pixel values
+        pixel_values = torch.cat(
+            [pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0)
+
+    else:
+        pixel_values, _ = load_image(
+            image,
+            input_size=input_size,
+            min_num=min_num,
+            max_num=max_num,
+            use_thumbnail=use_thumbnail,
+        )
+
+    return pixel_values
+
+
+def image_to_pixel_values_wrapper(hf_config: PretrainedConfig,
+                                  max_dynamic_patch: Optional[int] = None,
+                                  use_MSAC: Optional[bool] = None):
+    image_size = hf_config.vision_config.image_size
+    min_num = hf_config.min_dynamic_patch
+    if max_dynamic_patch is None:
+        max_dynamic_patch = hf_config.max_dynamic_patch
+    if use_MSAC is None:
+        use_MSAC = hf_config.use_msac
+    use_thumbnail = hf_config.use_thumbnail
+    return partial(
+        image_to_pixel_values,
+        input_size=image_size,
+        min_num=min_num,
+        max_num=max_dynamic_patch,
+        use_thumbnail=use_thumbnail,
+        use_MSAC=use_MSAC,
+    )
+
+
+def get_max_internvl_image_tokens(ctx: InputContext,
+                                  *,
+                                  max_dynamic_patch: Optional[int] = None):
+    """
+    Calculate the maximum number of tokens with/without MSAC and thumbnail
+    """
+    hf_config = ctx.get_hf_config()
+    use_thumbnail = hf_config.use_thumbnail
+    use_MSAC = hf_config.use_msac
+
+    if max_dynamic_patch is None:
+        max_dynamic_patch = hf_config.max_dynamic_patch
+
+    num_patches = get_internvl_num_patches(hf_config)
+
+    coefficient = 2 if use_MSAC else 1
+    num_blocks = coefficient * max_dynamic_patch + (1 if use_thumbnail else 0)
+
+    return num_blocks * num_patches
+
+
+class H2OVLInputPipeline(InternVLInputPipeline):
+    """
+    Input pipeline for processing image and text data for the H2OVL model.
+    """
+
+    def input_processor(
+        self,
+        ctx: InputContext,
+        inputs: DecoderOnlyInputs,
+        *,
+        max_dynamic_patch: Optional[int] = None,
+    ) -> DecoderOnlyInputs:
+        # get multi_modal_data
+        multi_modal_data = inputs.get("multi_modal_data")
+        if multi_modal_data is None or "image" not in multi_modal_data:
+            return inputs
+
+        model_config = ctx.model_config
+        hf_config = ctx.get_hf_config()
+        use_MSAC = hf_config.use_msac
+
+        image_data = multi_modal_data["image"]
+        num_patches = get_internvl_num_patches(hf_config)
+
+        image_pixel_values_mapper = image_to_pixel_values_wrapper(
+            hf_config, max_dynamic_patch=max_dynamic_patch)
+
+        # single image
+        if isinstance(image_data, Image.Image):
+            pixel_values = image_pixel_values_mapper(image_data,
+                                                     use_MSAC=use_MSAC)
+            num_blocks = pixel_values.shape[0]
+            image_feature_sizes = [num_blocks * num_patches]
+            pixel_values = pixel_values.unsqueeze(0)
+
+        # multi images
+        elif is_list_of(image_data, Image.Image):
+            # Do not use MSAC for multi images
+            image_feature_sizes = []
+            pixel_values = [
+                image_pixel_values_mapper(image, use_MSAC=False)
+                for image in image_data
+            ]
+            for pixel_value in pixel_values:
+                num_blocks = pixel_value.shape[0]
+                image_feature_sizes.append(num_blocks * num_patches)
+
+        # image embeddings as input
+        elif isinstance(image_data, torch.Tensor):
+            _, image_feature_size, _ = image_data.shape
+            image_feature_sizes = [image_feature_size]
+            pixel_values = None
+
+        # multi-image image embeddings
+        elif is_list_of(image_data, torch.Tensor):
+
+            image_feature_sizes = []
+            for image_embed in image_data:
+                _, image_feature_size, _ = image_embed.shape
+                image_feature_sizes.append(image_feature_size)
+            pixel_values = None
+
+        else:
+            raise TypeError(f"Invalid image type: {type(image_data)}")
+
+        tokenizer = cached_get_tokenizer(
+            model_config.tokenizer,
+            trust_remote_code=model_config.trust_remote_code,
+        )
+
+        prompt = inputs.get("prompt")
+        prompt_token_ids = inputs["prompt_token_ids"]
+        if prompt is None:
+            prompt = tokenizer.decode(prompt_token_ids)
+
+        new_prompt = self._expand_image_prompt(prompt, image_feature_sizes,
+                                               num_patches)
+        new_prompt_token_ids = tokenizer.encode(new_prompt)
+
+        # Wrap image processing in input_processor to avoid duplication
+        image_token_id = tokenizer.encode(
+            self.img_context_token,
+            add_special_tokens=False,
+            return_tensors="pt",
+        )[0]
+
+        # Update multi_modal_data to return
+        if pixel_values is not None:
+            multi_modal_data = {
+                "image": {
+                    "pixel_values": pixel_values,
+                    "image_token_id": image_token_id,
+                }
+            }
+        else:
+            multi_modal_data = {"image": {"image_embeds": image_data}}
+
+        return token_inputs(
+            prompt=prompt,
+            prompt_token_ids=new_prompt_token_ids,
+            multi_modal_data=multi_modal_data,
+        )
+
+    def input_mapper(
+        self,
+        ctx: InputContext,
+        data: object,
+        *,
+        max_dynamic_patch: Optional[int] = None,
+    ) -> MultiModalInputs:
+        """Map image data to model inputs; dicts pass through unchanged."""
+        # NOTE: Preprocessing for the image data is done in the
+        # 'input_processor' function during actual inference.
+        if isinstance(data, dict):
+            return MultiModalInputs(data)
+
+        # The section below is only used with dummy data during
+        # memory profiling.
+        hf_config = ctx.get_hf_config()
+        image_pixel_values_mapper = image_to_pixel_values_wrapper(
+            hf_config, max_dynamic_patch)
+
+        if isinstance(data, Image.Image):
+            pixel_values = image_pixel_values_mapper(data)
+            pixel_values = pixel_values.unsqueeze(0)
+        elif is_list_of(data, Image.Image):
+            # No MSAC for multi images; leave the shared hf_config untouched.
+            pixel_values = [
+                image_pixel_values_mapper(img, use_MSAC=False)
+                for img in data
+            ]
+        else:
+            return MultiModalInputs({"image_embeds": data})
+
+        model_config = ctx.model_config
+        tokenizer = cached_get_tokenizer(
+            model_config.tokenizer,
+            trust_remote_code=model_config.trust_remote_code,
+        )
+        image_token_id = tokenizer.encode(
+            self.img_context_token,
+            add_special_tokens=False,
+            return_tensors="pt",
+        )[0]
+        return MultiModalInputs({
+            "pixel_values": pixel_values,
+            "image_token_id": image_token_id
+        })
+
+
+input_pipeline = H2OVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT)
+
+
+@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper)
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens)
+@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data)
+@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor)
+class H2OVLChatModel(InternVLChatModel):
+
+    def _init_vision_model(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        *,
+        is_mono: bool,
+        prefix: str,
+    ):
+        if not is_mono:
+            vision_feature_layer = config.select_layer
+            if vision_feature_layer < 0:
+                num_hidden_layers = (config.vision_config.num_hidden_layers +
+                                     vision_feature_layer + 1)
+            else:
+                num_hidden_layers = vision_feature_layer + 1
+
+            return InternVisionModel(
+                config.vision_config,
+                quant_config=quant_config,
+                num_hidden_layers_override=num_hidden_layers,
+                prefix=prefix,
+            )
+        else:
+            msg = "Monolith mode is not applicable to H2OVL"
+            raise NotImplementedError(msg)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index f50ceaccb1bbe..3a929f5cb5195 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -128,6 +128,7 @@ def add_embedding_models(base_models, embedding_models):
     "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
     "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
     "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
+    "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
     "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
     "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
@@ -482,4 +483,4 @@ def _run() -> None:
 
 
 if __name__ == "__main__":
-    _run()
+    _run()
\ No newline at end of file
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 9bd2531d7a15c..08697274854e0 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -19,6 +19,7 @@
 # yapf: disable
 from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                              EAGLEConfig, ExaoneConfig,
+                                             H2OVLChatConfig,
                                              InternVLChatConfig, JAISConfig,
                                              MedusaConfig, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
@@ -52,6 +53,7 @@
     "medusa": MedusaConfig,
     "eagle": EAGLEConfig,
     "exaone": ExaoneConfig,
+    "h2ovl_chat": H2OVLChatConfig,
     "internvl_chat": InternVLChatConfig,
     "nemotron": NemotronConfig,
     "NVLM_D": NVLM_D_Config,
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index f0d79197a82c5..d1e19c9a33c24 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -6,6 +6,7 @@
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
 from vllm.transformers_utils.configs.falcon import RWConfig
+from vllm.transformers_utils.configs.h2ovl import H2OVLChatConfig
 from vllm.transformers_utils.configs.internvl import InternVLChatConfig
 from vllm.transformers_utils.configs.jais import JAISConfig
 from vllm.transformers_utils.configs.medusa import MedusaConfig
@@ -22,6 +23,7 @@
     "DbrxConfig",
     "MPTConfig",
     "RWConfig",
+    "H2OVLChatConfig",
     "InternVLChatConfig",
     "JAISConfig",
     "MedusaConfig",
@@ -33,4 +35,4 @@
     "NVLM_D_Config",
     "SolarConfig",
     "UltravoxConfig",
-]
+]
\ No newline at end of file
diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py
new file mode 100644
index 0000000000000..b94c5b77e4b7f
--- /dev/null
+++ b/vllm/transformers_utils/configs/h2ovl.py
@@ -0,0 +1,13 @@
+# Adapted from
+# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py
+# --------------------------------------------------------
+# H2OVL-Mississippi
+# Copyright (c) 2024 H2O.AI
+# Licensed under Apache 2.0 License [see LICENSE for details]
+# --------------------------------------------------------
+
+from .internvl import InternVLChatConfig
+
+
+class H2OVLChatConfig(InternVLChatConfig):
+    model_type = "h2ovl_chat"

From 91c9ebbb1bfc39e98aa2bd444b9569e5f2f92c9e Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 3 Nov 2024 19:24:40 -0500
Subject: [PATCH 36/85] [V1] Fix Configs (#9971)

---
 vllm/v1/executor/gpu_executor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/executor/gpu_executor.py b/vllm/v1/executor/gpu_executor.py
index de56332240192..f71fa16b16e27 100644
--- a/vllm/v1/executor/gpu_executor.py
+++ b/vllm/v1/executor/gpu_executor.py
@@ -1,7 +1,7 @@
 import os
 from typing import Optional, Tuple
 
-from vllm.config import EngineConfig
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.v1.outputs import ModelRunnerOutput
@@ -12,7 +12,8 @@
 
 class GPUExecutor:
 
-    def __init__(self, vllm_config: EngineConfig) -> None:
+    def __init__(self, vllm_config: VllmConfig) -> None:
+        self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.lora_config = vllm_config.lora_config

From c49f0407ba60bfee538892a09561c1fe7484adf8 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Mon, 4 Nov 2024 11:36:41 +0800
Subject: [PATCH 37/85] [Bugfix] Fix MiniCPMV and Mllama BNB  bug (#9917)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/layers/resampler.py    |  49 +++++----
 vllm/model_executor/model_loader/loader.py |  34 ++++--
 vllm/model_executor/models/minicpmv.py     | 120 ++++++++++++++-------
 vllm/model_executor/models/mllama.py       |   7 +-
 4 files changed, 145 insertions(+), 65 deletions(-)

diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py
index 8cd938fc85fb2..bce91f1d7fd5e 100644
--- a/vllm/model_executor/layers/resampler.py
+++ b/vllm/model_executor/layers/resampler.py
@@ -41,6 +41,7 @@
 from torch.nn.init import trunc_normal_
 
 from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.quantization import QuantizationConfig
 
 DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
 
@@ -154,15 +155,15 @@ class BaseResampler(nn.Module):
         A tensor with the shape of (grid_size**2, embed_dim)
     """
 
-    def __init__(
-        self,
-        num_queries: int,
-        embed_dim: int,
-        num_heads: int,
-        kv_dim: Optional[int] = None,
-        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
-        do_post_projection: bool = True,
-    ) -> None:
+    def __init__(self,
+                 num_queries: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 kv_dim: Optional[int] = None,
+                 norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+                 do_post_projection: bool = True,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "") -> None:
         super().__init__()
 
         self.num_queries = num_queries
@@ -172,7 +173,11 @@ def __init__(
         self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
         trunc_normal_(self.query, std=0.02)
         if kv_dim is not None and kv_dim != embed_dim:
-            self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False)
+            self.kv_proj = ReplicatedLinear(kv_dim,
+                                            embed_dim,
+                                            bias=False,
+                                            quant_config=quant_config,
+                                            prefix=prefix)
         else:
             # Maintain the same return value with ReplicatedLinear.forward
             self.kv_proj = lambda *args, **kwargs: (  # type: ignore # noqa 
@@ -209,22 +214,24 @@ class Resampler2(BaseResampler):
     present in minicpmv2.0, but not qwen-vl.
     """
 
-    def __init__(
-        self,
-        grid_size: int,
-        embed_dim: int,
-        num_heads: int,
-        kv_dim: Optional[int] = None,
-        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
-        adaptive: bool = False,
-        do_post_projection: bool = True,
-    ) -> None:
+    def __init__(self,
+                 grid_size: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 kv_dim: Optional[int] = None,
+                 norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+                 adaptive: bool = False,
+                 do_post_projection: bool = True,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "") -> None:
         super().__init__(grid_size**2,
                          embed_dim,
                          num_heads,
                          kv_dim,
                          norm_layer,
-                         do_post_projection=do_post_projection)
+                         do_post_projection=do_post_projection,
+                         quant_config=quant_config,
+                         prefix=prefix)
 
         self.adaptive = adaptive
         pos_embed_arr = get_2d_sincos_pos_embed(embed_dim,
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 2cf4e92908353..07adf7c01eaaf 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -28,6 +28,7 @@
                               get_tensor_model_parallel_world_size)
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.model_loader.tensorizer import (
@@ -771,6 +772,8 @@ def __init__(self, load_config: LoadConfig):
         with open(config_file_path, "r") as f:
             config = json.load(f)
             self.target_modules = config["target_modules"]
+        # Names of the modules whose weights are loaded without sharding.
+        self.unsharded_weights_modules: List[str] = []
 
     def _get_config_file(self, qlora_adapter: str) -> str:
         is_local = os.path.isdir(qlora_adapter)
@@ -990,16 +993,21 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors,
             if any(target_module in weight_name for target_module in
                    self.target_modules) and weight_name.endswith(".weight"):
                 weight_name = weight_name.replace(".weight", ".qweight")
-
-                if any(module in weight_name
-                       for module in self.column_parallel_weights_modules):
+                # Without sharding
+                if any(
+                        weight_name.startswith(module)
+                        for module in self.unsharded_weights_modules):
+                    weight_sub_tensor = weight_tensor
+                # Shard by column
+                elif any(module in weight_name
+                         for module in self.column_parallel_weights_modules):
 
                     total_size = weight_tensor.size(-1)
                     start_index = total_size // tp_size * tp_rank
                     end_index = total_size // tp_size * (tp_rank + 1)
                     weight_sub_tensor = weight_tensor[...,
                                                       start_index:end_index]
-
+                # Shard by row
                 else:
                     total_size = weight_tensor.size(0)
                     start_index = total_size // tp_size * tp_rank
@@ -1053,7 +1061,15 @@ def _load_weights(self, model_config: ModelConfig,
                 model.column_parallel_weights_modules
         else:
             self.column_parallel_weights_modules = []
-
+        # Some modules like `ReplicatedLinear` should not have their weights
+        # sharded. The reason for implementing it this way is to avoid a new
+        # static variable in the model implementation.
+        # TODO: Can we reduce the static variables needed for BNB based on
+        #  model information?
+        self.unsharded_weights_modules = [
+            name for name, module in model.named_modules()
+            if isinstance(module, (ReplicatedLinear, ))
+        ]
         self.model_type = type(model).__name__
 
         logger.info("Loading weights with BitsAndBytes quantization. "
@@ -1100,7 +1116,13 @@ def _load_weights(self, model_config: ModelConfig,
             for shard_name, (
                     weight_name, index
             ) in model.bitsandbytes_stacked_params_mapping.items():
-                if shard_name in quant_param_name:
+
+                shard_pos = quant_param_name.find(shard_name)
+                # Some models, such as MiniCPM V2.5/2.6, contain both module
+                # names 'kv_proj' and 'qkv_proj'. Require a '.' right before
+                # the match so that 'kv_proj' is not incorrectly found inside
+                # 'vpm.encoder.layers.0.self_attn.qkv_proj.qweight'.
+                if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".":
                     shard_index = index
                     quant_param_name = quant_param_name.replace(
                         shard_name, weight_name)
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index e7088edb97b2b..c1f714bb25680 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -131,16 +131,22 @@ class MiniCPMVImageEmbeddingInputs(TypedDict):
 
 class Resampler2_5(BaseResampler):
 
-    def __init__(
-            self,
-            num_queries: int,
-            embed_dim: int,
-            num_heads: int,
-            kv_dim: Optional[int] = None,
-            norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
-            max_size: Tuple[int, int] = (70, 70),
-    ) -> None:
-        super().__init__(num_queries, embed_dim, num_heads, kv_dim, norm_layer)
+    def __init__(self,
+                 num_queries: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 kv_dim: Optional[int] = None,
+                 norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+                 max_size: Tuple[int, int] = (70, 70),
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "") -> None:
+        super().__init__(num_queries,
+                         embed_dim,
+                         num_heads,
+                         kv_dim,
+                         norm_layer,
+                         quant_config=quant_config,
+                         prefix=prefix)
 
         self.max_size = max_size
         self._set_2d_pos_cache(self.max_size)
@@ -404,7 +410,10 @@ def __init__(
         self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else
                            self.vpm.embeddings.embed_dim)
         self.embed_dim = self.config.hidden_size
-        self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
+        self.resampler = self.init_resampler(self.embed_dim,
+                                             self.vision_dim,
+                                             quant_config=quant_config,
+                                             prefix="resampler")
         self.resampler.to(device="cuda", dtype=param_dtype)
         # TODO: why is there _KEYS_TO_MODIFY_MAPPING? lm_head should be in llm
         self.lm_head = ParallelLMHead(config.vocab_size,
@@ -666,7 +675,11 @@ def init_vision_module(
     ) -> nn.Module:
         raise NotImplementedError
 
-    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
+    def init_resampler(self,
+                       embed_dim: int,
+                       vision_dim: int,
+                       quant_config: Optional[QuantizationConfig] = None,
+                       prefix: str = "") -> nn.Module:
         raise NotImplementedError
 
     def get_vision_embedding(
@@ -743,16 +756,21 @@ def init_vision_module(
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.embed_tokens(input_ids)
 
-    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
+    def init_resampler(self,
+                       embed_dim: int,
+                       vision_dim: int,
+                       quant_config: Optional[QuantizationConfig] = None,
+                       prefix: str = "") -> nn.Module:
         with set_default_torch_dtype(torch.float16):
-            resampler = Resampler2(
-                embed_dim=embed_dim,
-                num_heads=embed_dim // 128,
-                grid_size=int(math.sqrt(self.config.query_num)),
-                kv_dim=vision_dim,
-                adaptive=False,
-                do_post_projection=True,
-            )
+            resampler = Resampler2(embed_dim=embed_dim,
+                                   num_heads=embed_dim // 128,
+                                   grid_size=int(
+                                       math.sqrt(self.config.query_num)),
+                                   kv_dim=vision_dim,
+                                   adaptive=False,
+                                   do_post_projection=True,
+                                   quant_config=quant_config,
+                                   prefix=prefix)
 
         return resampler
 
@@ -825,9 +843,21 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
         ".k_proj.",
         ".v_proj.",
         ".o_proj.",
+        # vision encoder
+        ".fc1.",
+        ".fc2.",
+        # Currently, vllm does not support BNB quantization for the `out_proj`
+        # of the resampler, so it's necessary to distinguish between the
+        # vision encoder and the resampler's out_proj. The same applies to
+        # MiniCPMV2_6.
+        ".self_attn.out_proj.",  #  vision encoder out_proj
+        # resampler
+        ".kv_proj.",
     ]
     # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    column_parallel_weights_modules = [
+        ".down_proj.", ".o_proj.", ".self_attn.out_proj.", ".fc2."
+    ]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),
@@ -877,14 +907,18 @@ def init_vision_module(
             model.encoder.layers = model.encoder.layers[:-1]
         return model
 
-    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
+    def init_resampler(self,
+                       embed_dim: int,
+                       vision_dim: int,
+                       quant_config: Optional[QuantizationConfig] = None,
+                       prefix: str = "") -> nn.Module:
         with set_default_torch_dtype(torch.float16):
-            resampler = Resampler2_5(
-                num_queries=self.config.query_num,
-                embed_dim=embed_dim,
-                num_heads=embed_dim // 128,
-                kv_dim=vision_dim,
-            )
+            resampler = Resampler2_5(num_queries=self.config.query_num,
+                                     embed_dim=embed_dim,
+                                     num_heads=embed_dim // 128,
+                                     kv_dim=vision_dim,
+                                     quant_config=quant_config,
+                                     prefix=prefix)
         return resampler
 
     def get_vision_embedding(
@@ -967,9 +1001,17 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
         ".k_proj.",
         ".v_proj.",
         ".o_proj.",
+        # vision encoder
+        ".fc1.",
+        ".fc2.",
+        ".self_attn.out_proj.",
+        # resampler
+        ".kv_proj.",
     ]
     # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    column_parallel_weights_modules = [
+        ".down_proj.", ".o_proj.", ".self_attn.out_proj.", ".fc2."
+    ]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),
@@ -1019,15 +1061,19 @@ def init_vision_module(
             model.encoder.layers = model.encoder.layers[:-1]
         return model
 
-    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
+    def init_resampler(self,
+                       embed_dim: int,
+                       vision_dim: int,
+                       quant_config: Optional[QuantizationConfig] = None,
+                       prefix: str = "") -> nn.Module:
         with set_default_torch_dtype(torch.float16):
             # The resampler in 2.6 remains consistent with the one in 2.5.
-            resampler = Resampler2_5(
-                num_queries=self.config.query_num,
-                embed_dim=embed_dim,
-                num_heads=embed_dim // 128,
-                kv_dim=vision_dim,
-            )
+            resampler = Resampler2_5(num_queries=self.config.query_num,
+                                     embed_dim=embed_dim,
+                                     num_heads=embed_dim // 128,
+                                     kv_dim=vision_dim,
+                                     quant_config=quant_config,
+                                     prefix=prefix)
         return resampler
 
     def get_vision_embedding(
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 19c3827e43703..a03155ac32a61 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1056,9 +1056,14 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
         ".k_proj.",
         ".v_proj.",
         ".o_proj.",
+        ".fc1.",
+        ".fc2.",
+        # The `multi_modal_projector` is at the top level of the model,
+        # so we can't add a dot in front of it.
+        "multi_modal_projector."
     ]
     # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    column_parallel_weights_modules = [".down_proj.", ".o_proj.", ".fc2."]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),

From b67feb12749ef8c01ef77142c3cd534bb3d87eda Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Mon, 4 Nov 2024 01:19:51 -0500
Subject: [PATCH 38/85] [Bugfix]Using the correct type hints (#9885)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 vllm/sequence.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index ee547dde45394..44a9257c9a4c1 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -6,7 +6,8 @@
 from collections import defaultdict
 from dataclasses import dataclass, field
 from functools import cached_property, reduce
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional
+from typing import (TYPE_CHECKING, Any, Callable, DefaultDict, Dict, List,
+                    Mapping, Optional)
 from typing import Sequence as GenericSequence
 from typing import Set, Tuple, Union, cast
 
@@ -256,7 +257,8 @@ def output_token_ids(self) -> Tuple[int, ...]:
         return tuple(self._output_token_ids)
 
     @output_token_ids.setter
-    def output_token_ids(self, new_output_token_ids: List[int]) -> None:
+    def output_token_ids(self,
+                         new_output_token_ids: GenericSequence[int]) -> None:
         self._output_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
                                        new_output_token_ids)
         self._update_cached_all_tokens()
@@ -1173,7 +1175,7 @@ def get_all_seq_ids_and_request_ids(
     sequence ids.
     """
     seq_ids: List[int] = []
-    request_id_seq_ids_mapping: Dict[str, Set[int]] = defaultdict(set)
+    request_id_seq_ids_mapping: DefaultDict[str, Set[int]] = defaultdict(set)
     for sg in seq_group_metadata_list:
         for seq_id in sg.seq_data:
             seq_ids.append(seq_id)

From 4dbcbbeb09628eb3181dedb6789f0ccb05e83957 Mon Sep 17 00:00:00 2001
From: Yang Zheng <50227060+zhengy001@users.noreply.github.com>
Date: Mon, 4 Nov 2024 16:54:37 +0800
Subject: [PATCH 39/85] [Misc] Compute query_start_loc/seq_start_loc on CPU
 (#9447)

Co-authored-by: Yang Zheng(SW)(Alex) <you@example.com>
---
 vllm/attention/backends/flash_attn.py | 28 ++++++++++-----------------
 vllm/attention/backends/utils.py      | 28 ++++++++++-----------------
 2 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 2975a41797e9f..26da0d89def29 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -1,6 +1,7 @@
 """Attention layer with FlashAttention."""
 from collections import defaultdict
 from dataclasses import dataclass
+from itertools import accumulate
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
 
 import torch
@@ -503,6 +504,8 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
         max_decode_seq_len = max(self.curr_seq_lens, default=0)
         num_decode_tokens = self.num_decode_tokens
+        query_start_loc = list(accumulate(query_lens, initial=0))
+        seq_start_loc = list(accumulate(seq_lens, initial=0))
 
         num_seqs = len(seq_lens)
         if use_captured_graph:
@@ -525,29 +528,18 @@ def build(self, seq_lens: List[int], query_lens: List[int],
                                                device, self.runner.pin_memory)
         seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
                                            self.runner.pin_memory)
-        query_lens_tensor = async_tensor_h2d(query_lens, torch.long, device,
-                                             self.runner.pin_memory)
         slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
                                                device, self.runner.pin_memory)
-        query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1,
-                                      dtype=torch.int32,
-                                      device=device)
-        seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
-                                    dtype=torch.int32,
-                                    device=device)
+        query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
+                                                  device,
+                                                  self.runner.pin_memory)
+        seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
+                                                device, self.runner.pin_memory)
         placeholder_index_maps = {
             modality: placeholder_map.index_map()
             for modality, placeholder_map in
             self.multimodal_placeholder_maps.items()
         }
-        torch.cumsum(seq_lens_tensor,
-                     dim=0,
-                     dtype=seq_start_loc.dtype,
-                     out=seq_start_loc[1:])
-        torch.cumsum(query_lens_tensor,
-                     dim=0,
-                     dtype=query_start_loc.dtype,
-                     out=query_start_loc[1:])
 
         return FlashAttentionMetadata(
             num_prefills=self.num_prefills,
@@ -561,8 +553,8 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             max_decode_query_len=max_decode_query_len,
             max_prefill_seq_len=max_prefill_seq_len,
             max_decode_seq_len=max_decode_seq_len,
-            query_start_loc=query_start_loc,
-            seq_start_loc=seq_start_loc,
+            query_start_loc=query_start_loc_tensor,
+            seq_start_loc=seq_start_loc_tensor,
             context_lens_tensor=context_lens_tensor,
             block_tables=block_tables,
             use_cuda_graph=use_captured_graph,
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 096c920c4833a..12800668af223 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -1,6 +1,7 @@
 """Attention backend utils"""
 from collections import defaultdict
 from contextlib import contextmanager
+from itertools import accumulate
 from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union
 
 import numpy as np
@@ -216,6 +217,8 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
         max_decode_seq_len = max(self.curr_seq_lens, default=0)
         num_decode_tokens = self.num_decode_tokens
+        query_start_loc = list(accumulate(query_lens, initial=0))
+        seq_start_loc = list(accumulate(seq_lens, initial=0))
 
         if use_captured_graph:
             self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
@@ -244,29 +247,18 @@ def build(self, seq_lens: List[int], query_lens: List[int],
                                                device, self.runner.pin_memory)
         seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
                                            self.runner.pin_memory)
-        query_lens_tensor = async_tensor_h2d(query_lens, torch.long, device,
-                                             self.runner.pin_memory)
         slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
                                                device, self.runner.pin_memory)
-        query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1,
-                                      dtype=torch.int32,
-                                      device=device)
-        seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
-                                    dtype=torch.int32,
-                                    device=device)
+        query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
+                                                  device,
+                                                  self.runner.pin_memory)
+        seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
+                                                device, self.runner.pin_memory)
         placeholder_index_maps = {
             modality: placeholder_map.index_map()
             for modality, placeholder_map in
             self.multimodal_placeholder_maps.items()
         }
-        torch.cumsum(seq_lens_tensor,
-                     dim=0,
-                     dtype=seq_start_loc.dtype,
-                     out=seq_start_loc[1:])
-        torch.cumsum(query_lens_tensor,
-                     dim=0,
-                     dtype=query_start_loc.dtype,
-                     out=query_start_loc[1:])
 
         return self._metadata_cls(  # type: ignore
             num_prefills=self.num_prefills,
@@ -279,8 +271,8 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             max_query_len=max_query_len,
             max_prefill_seq_len=max_prefill_seq_len,
             max_decode_seq_len=max_decode_seq_len,
-            query_start_loc=query_start_loc,
-            seq_start_loc=seq_start_loc,
+            query_start_loc=query_start_loc_tensor,
+            seq_start_loc=seq_start_loc_tensor,
             context_lens_tensor=context_lens_tensor,
             block_tables=block_tables,
             use_cuda_graph=use_captured_graph,

From ea4adeddc12412ad0854f93882e214000e91ce05 Mon Sep 17 00:00:00 2001
From: Tran Quang Dai <62875701+daitran2k1@users.noreply.github.com>
Date: Mon, 4 Nov 2024 16:37:58 +0700
Subject: [PATCH 40/85] [Bugfix] Fix E2EL mean and median stats (#9984)

Signed-off-by: daitran2k1 <tranquangdai7a@gmail.com>
---
 benchmarks/benchmark_serving.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 0d205014b15bf..ff06622628219 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -406,9 +406,9 @@ def calculate_metrics(
         median_itl_ms=np.median(itls or 0) * 1000,
         percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
                             for p in selected_percentiles],
-        mean_e2el_ms=np.median(e2els or 0) * 1000,
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
         std_e2el_ms=np.std(e2els or 0) * 1000,
-        median_e2el_ms=np.mean(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
         percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                              for p in selected_percentiles],
     )

From ccb5376a9a88bb6251c4434b79c173151e6f7729 Mon Sep 17 00:00:00 2001
From: Mengqing Cao <cmq0113@163.com>
Date: Mon, 4 Nov 2024 18:14:13 +0800
Subject: [PATCH 41/85] [Bugfix][OpenVINO] Fix circular reference #9939 (#9974)

Signed-off-by: MengqingCao <cmq0113@163.com>
---
 vllm/platforms/openvino.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index 35dbe22abf7ff..31fe3f1fcbfe4 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -1,10 +1,12 @@
 import torch
 
 import vllm.envs as envs
-from vllm.utils import print_warning_once
+from vllm.logger import init_logger
 
 from .interface import Platform, PlatformEnum
 
+logger = init_logger(__name__)
+
 
 class OpenVinoPlatform(Platform):
     _enum = PlatformEnum.OPENVINO
@@ -27,5 +29,5 @@ def is_openvino_gpu(self) -> bool:
 
     @classmethod
     def is_pin_memory_available(self) -> bool:
-        print_warning_once("Pin memory is not supported on OpenViNO.")
+        logger.warning("Pin memory is not supported on OpenViNO.")
         return False

From ac6b8f19b9007433b9cbf057c1d01ae9d4efdad5 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Mon, 4 Nov 2024 23:34:57 +0800
Subject: [PATCH 42/85] [Frontend] Multi-Modality Support for Loading Local
 Image Files (#9915)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 tests/multimodal/test_utils.py | 39 +++++++++++++++++-
 vllm/config.py                 |  8 ++++
 vllm/engine/arg_utils.py       |  9 ++++
 vllm/entrypoints/chat_utils.py |  9 +++-
 vllm/entrypoints/llm.py        |  6 +++
 vllm/multimodal/utils.py       | 75 +++++++++++++++++++++++++++++-----
 6 files changed, 132 insertions(+), 14 deletions(-)

diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 69f04f0a69c0b..9869c8123f001 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -1,11 +1,12 @@
 import base64
 import mimetypes
-from tempfile import NamedTemporaryFile
+import os
+from tempfile import NamedTemporaryFile, TemporaryDirectory
 from typing import Dict, Tuple
 
 import numpy as np
 import pytest
-from PIL import Image
+from PIL import Image, ImageChops
 from transformers import AutoConfig, AutoTokenizer
 
 from vllm.multimodal.utils import (async_fetch_image, fetch_image,
@@ -84,6 +85,40 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
         assert _image_equals(data_image_sync, data_image_async)
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_fetch_image_local_files(image_url: str):
+    with TemporaryDirectory() as temp_dir:
+        origin_image = fetch_image(image_url)
+        origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)),
+                          quality=100,
+                          icc_profile=origin_image.info.get('icc_profile'))
+
+        image_async = await async_fetch_image(
+            f"file://{temp_dir}/{os.path.basename(image_url)}",
+            allowed_local_media_path=temp_dir)
+
+        image_sync = fetch_image(
+            f"file://{temp_dir}/{os.path.basename(image_url)}",
+            allowed_local_media_path=temp_dir)
+        # Check that the images are equal
+        assert not ImageChops.difference(image_sync, image_async).getbbox()
+
+        with pytest.raises(ValueError):
+            await async_fetch_image(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}",
+                allowed_local_media_path=temp_dir)
+        with pytest.raises(ValueError):
+            await async_fetch_image(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}")
+
+        with pytest.raises(ValueError):
+            fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}",
+                        allowed_local_media_path=temp_dir)
+        with pytest.raises(ValueError):
+            fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}")
+
+
 @pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 def test_repeat_and_pad_placeholder_tokens(model):
     config = AutoConfig.from_pretrained(model)
diff --git a/vllm/config.py b/vllm/config.py
index 17e9b1c100498..0870eb9f70709 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -55,6 +55,10 @@ class ModelConfig:
             "mistral" will always use the tokenizer from `mistral_common`.
         trust_remote_code: Trust remote code (e.g., from HuggingFace) when
             downloading the model and tokenizer.
+        allowed_local_media_path: Allow API requests to read local images or
+            videos from this directory on the server file system.
+            This is a security risk. Should only be enabled in trusted
+            environments.
         dtype: Data type for model weights and activations. The "auto" option
             will use FP16 precision for FP32 and FP16 models, and BF16 precision
             for BF16 models.
@@ -134,6 +138,7 @@ def __init__(
             trust_remote_code: bool,
             dtype: Union[str, torch.dtype],
             seed: int,
+            allowed_local_media_path: str = "",
             revision: Optional[str] = None,
             code_revision: Optional[str] = None,
             rope_scaling: Optional[dict] = None,
@@ -164,6 +169,7 @@ def __init__(
         self.tokenizer = tokenizer
         self.tokenizer_mode = tokenizer_mode
         self.trust_remote_code = trust_remote_code
+        self.allowed_local_media_path = allowed_local_media_path
         self.seed = seed
         self.revision = revision
         self.code_revision = code_revision
@@ -1319,6 +1325,8 @@ def maybe_create_spec_config(
                 tokenizer=target_model_config.tokenizer,
                 tokenizer_mode=target_model_config.tokenizer_mode,
                 trust_remote_code=target_model_config.trust_remote_code,
+                allowed_local_media_path=target_model_config.
+                allowed_local_media_path,
                 dtype=target_model_config.dtype,
                 seed=target_model_config.seed,
                 revision=draft_revision,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index da06ab186821e..bd39e72d58caa 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -92,6 +92,7 @@ class EngineArgs:
     tokenizer_mode: str = 'auto'
     chat_template_text_format: str = 'string'
     trust_remote_code: bool = False
+    allowed_local_media_path: str = ""
     download_dir: Optional[str] = None
     load_format: str = 'auto'
     config_format: ConfigFormat = ConfigFormat.AUTO
@@ -269,6 +270,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument('--trust-remote-code',
                             action='store_true',
                             help='Trust remote code from huggingface.')
+        parser.add_argument(
+            '--allowed-local-media-path',
+            type=str,
+            help="Allowing API requests to read local images or videos "
+            "from directories specified by the server file system. "
+            "This is a security risk. "
+            "Should only be enabled in trusted environments.")
         parser.add_argument('--download-dir',
                             type=nullable_str,
                             default=EngineArgs.download_dir,
@@ -920,6 +928,7 @@ def create_model_config(self) -> ModelConfig:
             tokenizer_mode=self.tokenizer_mode,
             chat_template_text_format=self.chat_template_text_format,
             trust_remote_code=self.trust_remote_code,
+            allowed_local_media_path=self.allowed_local_media_path,
             dtype=self.dtype,
             seed=self.seed,
             revision=self.revision,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index c9552977710d1..8da08d4b2c93c 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -307,7 +307,9 @@ def __init__(self, tracker: MultiModalItemTracker) -> None:
         self._tracker = tracker
 
     def parse_image(self, image_url: str) -> None:
-        image = get_and_parse_image(image_url)
+        image = get_and_parse_image(image_url,
+                                    allowed_local_media_path=self._tracker.
+                                    _model_config.allowed_local_media_path)
 
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
@@ -327,7 +329,10 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None:
         self._tracker = tracker
 
     def parse_image(self, image_url: str) -> None:
-        image_coro = async_get_and_parse_image(image_url)
+        image_coro = async_get_and_parse_image(
+            image_url,
+            allowed_local_media_path=self._tracker._model_config.
+            allowed_local_media_path)
 
         placeholder = self._tracker.add("image", image_coro)
         self._add_placeholder(placeholder)
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 3d62cb3598477..b18974c5a0c57 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -58,6 +58,10 @@ class LLM:
             from the input.
         trust_remote_code: Trust remote code (e.g., from HuggingFace) when
             downloading the model and tokenizer.
+        allowed_local_media_path: Allow API requests to read local images
+            or videos from this directory on the server file system.
+            This is a security risk. Should only be enabled in trusted
+            environments.
         tensor_parallel_size: The number of GPUs to use for distributed
             execution with tensor parallelism.
         dtype: The data type for the model weights and activations. Currently,
@@ -139,6 +143,7 @@ def __init__(
         tokenizer_mode: str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
+        allowed_local_media_path: str = "",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         quantization: Optional[str] = None,
@@ -179,6 +184,7 @@ def __init__(
             tokenizer_mode=tokenizer_mode,
             skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
+            allowed_local_media_path=allowed_local_media_path,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             quantization=quantization,
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index c5ff552e06099..283c23c94d330 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,4 +1,5 @@
 import base64
+import os
 from functools import lru_cache
 from io import BytesIO
 from typing import Any, List, Optional, Tuple, TypeVar, Union
@@ -18,19 +19,60 @@
 cached_get_tokenizer = lru_cache(get_tokenizer)
 
 
-def _load_image_from_bytes(b: bytes):
+def _load_image_from_bytes(b: bytes) -> Image.Image:
     image = Image.open(BytesIO(b))
     image.load()
     return image
 
 
-def _load_image_from_data_url(image_url: str):
+def _is_subpath(image_path: str, allowed_local_media_path: str) -> bool:
+    # Get the common path
+    common_path = os.path.commonpath([
+        os.path.abspath(image_path),
+        os.path.abspath(allowed_local_media_path)
+    ])
+    # Check if the common path is the same as allowed_local_media_path
+    return common_path == os.path.abspath(allowed_local_media_path)
+
+
+def _load_image_from_file(image_url: str,
+                          allowed_local_media_path: str) -> Image.Image:
+    if not allowed_local_media_path:
+        raise ValueError("Invalid 'image_url': Cannot load local files without"
+                         " '--allowed-local-media-path'.")
+    if allowed_local_media_path:
+        if not os.path.exists(allowed_local_media_path):
+            raise ValueError(
+                "Invalid '--allowed-local-media-path': "
+                f"The path {allowed_local_media_path} does not exist.")
+        if not os.path.isdir(allowed_local_media_path):
+            raise ValueError(
+                "Invalid '--allowed-local-media-path': "
+                f"The path {allowed_local_media_path} must be a directory.")
+
+    # Only split once and assume the second part is the image path
+    _, image_path = image_url.split("file://", 1)
+    if not _is_subpath(image_path, allowed_local_media_path):
+        raise ValueError(
+            f"Invalid 'image_url': The file path {image_path} must"
+            " be a subpath of '--allowed-local-media-path'"
+            f" '{allowed_local_media_path}'.")
+
+    image = Image.open(image_path)
+    image.load()
+    return image
+
+
+def _load_image_from_data_url(image_url: str) -> Image.Image:
     # Only split once and assume the second part is the base64 encoded image
     _, image_base64 = image_url.split(",", 1)
     return load_image_from_base64(image_base64)
 
 
-def fetch_image(image_url: str, *, image_mode: str = "RGB") -> Image.Image:
+def fetch_image(image_url: str,
+                *,
+                image_mode: str = "RGB",
+                allowed_local_media_path: str = "") -> Image.Image:
     """
     Load a PIL image from a HTTP or base64 data URL.
 
@@ -43,16 +85,19 @@ def fetch_image(image_url: str, *, image_mode: str = "RGB") -> Image.Image:
 
     elif image_url.startswith('data:image'):
         image = _load_image_from_data_url(image_url)
+    elif image_url.startswith('file://'):
+        image = _load_image_from_file(image_url, allowed_local_media_path)
     else:
         raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
-                         "with either 'data:image' or 'http'.")
+                         "with either 'data:image', 'file://' or 'http'.")
 
     return image.convert(image_mode)
 
 
 async def async_fetch_image(image_url: str,
                             *,
-                            image_mode: str = "RGB") -> Image.Image:
+                            image_mode: str = "RGB",
+                            allowed_local_media_path: str = "") -> Image.Image:
     """
     Asynchronously load a PIL image from a HTTP or base64 data URL.
 
@@ -65,9 +110,11 @@ async def async_fetch_image(image_url: str,
 
     elif image_url.startswith('data:image'):
         image = _load_image_from_data_url(image_url)
+    elif image_url.startswith('file://'):
+        image = _load_image_from_file(image_url, allowed_local_media_path)
     else:
         raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
-                         "with either 'data:image' or 'http'.")
+                         "with either 'data:image', 'file://' or 'http'.")
 
     return image.convert(image_mode)
 
@@ -126,8 +173,12 @@ def get_and_parse_audio(audio_url: str) -> MultiModalDataDict:
     return {"audio": (audio, sr)}
 
 
-def get_and_parse_image(image_url: str) -> MultiModalDataDict:
-    image = fetch_image(image_url)
+def get_and_parse_image(
+        image_url: str,
+        *,
+        allowed_local_media_path: str = "") -> MultiModalDataDict:
+    image = fetch_image(image_url,
+                        allowed_local_media_path=allowed_local_media_path)
     return {"image": image}
 
 
@@ -136,8 +187,12 @@ async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict:
     return {"audio": (audio, sr)}
 
 
-async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
-    image = await async_fetch_image(image_url)
+async def async_get_and_parse_image(
+        image_url: str,
+        *,
+        allowed_local_media_path: str = "") -> MultiModalDataDict:
+    image = await async_fetch_image(
+        image_url, allowed_local_media_path=allowed_local_media_path)
     return {"image": image}
 
 

From 8d72bb20fae1a8a9d6ec6dcb2a833a190e1225d3 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 4 Nov 2024 08:51:31 -0800
Subject: [PATCH 43/85] [4/N] make quant config first-class citizen (#9978)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/config.py                             | 38 ++++++++++++++++++++++
 vllm/model_executor/model_loader/loader.py | 34 ++-----------------
 2 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 0870eb9f70709..814e00c8785f0 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -23,9 +23,13 @@
     from ray.util.placement_group import PlacementGroup
 
     from vllm.executor.executor_base import ExecutorBase
+    from vllm.model_executor.layers.quantization.base_config import (
+        QuantizationConfig)
     from vllm.model_executor.model_loader.loader import BaseModelLoader
     from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
         BaseTokenizerGroup)
+else:
+    QuantizationConfig = None
 
 logger = init_logger(__name__)
 
@@ -1966,6 +1970,35 @@ class VllmConfig:
     decoding_config: Optional[DecodingConfig] = None
     observability_config: Optional[ObservabilityConfig] = None
     prompt_adapter_config: Optional[PromptAdapterConfig] = None
+    quant_config: Optional[QuantizationConfig] = None
+
+    @staticmethod
+    def _get_quantization_config(
+            model_config: ModelConfig,
+            load_config: LoadConfig) -> Optional[QuantizationConfig]:
+        """Get the quantization config."""
+        if model_config.quantization is not None:
+            from vllm.model_executor.model_loader.weight_utils import (
+                get_quant_config)
+            quant_config = get_quant_config(model_config, load_config)
+            capability_tuple = current_platform.get_device_capability()
+
+            if capability_tuple is not None:
+                capability = capability_tuple.to_int()
+                if capability < quant_config.get_min_capability():
+                    raise ValueError(
+                        f"The quantization method {model_config.quantization} "
+                        "is not supported for the current GPU. Minimum "
+                        f"capability: {quant_config.get_min_capability()}. "
+                        f"Current capability: {capability}.")
+            supported_dtypes = quant_config.get_supported_act_dtypes()
+            if model_config.dtype not in supported_dtypes:
+                raise ValueError(
+                    f"{model_config.dtype} is not supported for quantization "
+                    f"method {model_config.quantization}. Supported dtypes: "
+                    f"{supported_dtypes}")
+            return quant_config
+        return None
 
     def __post_init__(self):
         """Verify configs are valid & consistent with each other.
@@ -1983,3 +2016,8 @@ def __post_init__(self):
         if self.prompt_adapter_config:
             self.prompt_adapter_config.verify_with_model_config(
                 self.model_config)
+
+        if self.quant_config is None and \
+            self.model_config is not None and self.load_config is not None:
+            self.quant_config = VllmConfig._get_quantization_config(
+                self.model_config, self.load_config)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 07adf7c01eaaf..5edb951343ae0 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -39,7 +39,7 @@
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
     filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
-    get_gguf_extra_tensor_names, get_quant_config, gguf_quant_weights_iterator,
+    get_gguf_extra_tensor_names, gguf_quant_weights_iterator,
     initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator,
     safetensors_weights_iterator)
 from vllm.model_executor.models import (has_inner_state, supports_lora,
@@ -93,32 +93,6 @@ def device_loading_context(module: torch.nn.Module,
 logger = init_logger(__name__)
 
 
-def _get_quantization_config(
-        model_config: ModelConfig,
-        load_config: LoadConfig) -> Optional[QuantizationConfig]:
-    """Get the quantization config."""
-    if model_config.quantization is not None:
-        quant_config = get_quant_config(model_config, load_config)
-        capability_tuple = current_platform.get_device_capability()
-
-        if capability_tuple is not None:
-            capability = capability_tuple.to_int()
-            if capability < quant_config.get_min_capability():
-                raise ValueError(
-                    f"The quantization method {model_config.quantization} "
-                    "is not supported for the current GPU. "
-                    f"Minimum capability: {quant_config.get_min_capability()}. "
-                    f"Current capability: {capability}.")
-        supported_dtypes = quant_config.get_supported_act_dtypes()
-        if model_config.dtype not in supported_dtypes:
-            raise ValueError(
-                f"{model_config.dtype} is not supported for quantization "
-                f"method {model_config.quantization}. Supported dtypes: "
-                f"{supported_dtypes}")
-        return quant_config
-    return None
-
-
 def _get_model_initialization_kwargs(
         model_class: Type[nn.Module],
         lora_config: Optional[LoRAConfig],
@@ -185,7 +159,6 @@ def _initialize_model(vllm_config: VllmConfig) -> nn.Module:
     lora_config = vllm_config.lora_config
     scheduler_config = vllm_config.scheduler_config
     cache_config = vllm_config.cache_config
-    load_config = vllm_config.load_config
     model_class, _ = get_model_architecture(model_config)
 
     return build_model(
@@ -193,7 +166,7 @@ def _initialize_model(vllm_config: VllmConfig) -> nn.Module:
         vllm_config,
         model_config.hf_config,
         cache_config=cache_config,
-        quant_config=_get_quantization_config(model_config, load_config),
+        quant_config=vllm_config.quant_config,
         lora_config=lora_config,
         multimodal_config=model_config.multimodal_config,
         scheduler_config=scheduler_config,
@@ -518,8 +491,7 @@ def _load_model_serialized(
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
                 model_class = get_model_architecture(model_config)[0]
-                quant_config = _get_quantization_config(
-                    model_config, self.load_config)
+                quant_config = vllm_config.quant_config
                 extra_kwargs = _get_model_initialization_kwargs(
                     model_class, lora_config, model_config.multimodal_config)
                 extra_kwargs["quant_config"] = quant_config

From fb2716d64117aaa6c36b97b09765aa10a89e2fe5 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Tue, 5 Nov 2024 01:04:40 +0800
Subject: [PATCH 44/85] [Misc]Reduce BNB static variable (#9987)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/model_loader/loader.py | 40 +++++++++++-----------
 vllm/model_executor/models/falcon.py       |  2 --
 vllm/model_executor/models/gemma.py        |  3 --
 vllm/model_executor/models/gemma2.py       |  2 --
 vllm/model_executor/models/llama.py        |  2 --
 vllm/model_executor/models/minicpmv.py     |  8 -----
 vllm/model_executor/models/mllama.py       |  2 --
 vllm/model_executor/models/opt.py          |  2 --
 vllm/model_executor/models/phi.py          |  2 --
 vllm/model_executor/models/qwen2.py        |  3 --
 10 files changed, 20 insertions(+), 46 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 5edb951343ae0..c3e0290f270ae 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -28,7 +28,8 @@
                               get_tensor_model_parallel_world_size)
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.linear import (ReplicatedLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.model_loader.tensorizer import (
@@ -727,6 +728,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
     def __init__(self, load_config: LoadConfig):
         super().__init__(load_config)
 
+        # Save the module names without sharding.
+        self.unsharded_weights_modules: List[str] = []
+        # Save the module names that are sharded by column.
+        self.column_sharded_weights_modules: List[str] = []
         # we don't need to quantize the whole model, only the target modules
         # that are specified in the adapter config file. If the adapter config
         # file is not provided, we will quantize the default modules.
@@ -744,8 +749,6 @@ def __init__(self, load_config: LoadConfig):
         with open(config_file_path, "r") as f:
             config = json.load(f)
             self.target_modules = config["target_modules"]
-        # Save the module names without sharding.
-        self.unsharded_weights_modules: List[str] = []
 
     def _get_config_file(self, qlora_adapter: str) -> str:
         is_local = os.path.isdir(qlora_adapter)
@@ -971,9 +974,9 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors,
                         for module in self.unsharded_weights_modules):
                     weight_sub_tensor = weight_tensor
                 # Shard by column
-                elif any(module in weight_name
-                         for module in self.column_parallel_weights_modules):
-
+                elif any(
+                        weight_name.startswith(module)
+                        for module in self.column_sharded_weights_modules):
                     total_size = weight_tensor.size(-1)
                     start_index = total_size // tp_size * tp_rank
                     end_index = total_size // tp_size * (tp_rank + 1)
@@ -1028,20 +1031,17 @@ def _load_weights(self, model_config: ModelConfig,
             else:
                 self.target_modules = self.default_target_modules
 
-        if hasattr(model, 'column_parallel_weights_modules'):
-            self.column_parallel_weights_modules = \
-                model.column_parallel_weights_modules
-        else:
-            self.column_parallel_weights_modules = []
-        # Some modules like `ReplicatedLinear` should not have their weights
-        # sharded. The reason for implementing it this way is to avoid new
-        # static variable in the model implementation.
-        # TODO: Can we reduce the static variables needed for BNB based on
-        #  model information?
-        self.unsharded_weights_modules = [
-            name for name, module in model.named_modules()
-            if isinstance(module, (ReplicatedLinear, ))
-        ]
+        for name, module in model.named_modules():
+            # Some modules like `ReplicatedLinear` should not have their weights
+            # sharded. The reason for implementing it this way is to avoid new
+            # static variable in the model implementation.
+            if isinstance(module, (ReplicatedLinear, )):
+                self.unsharded_weights_modules.append(name)
+            # In TP, these weights are partitioned along the column
+            # dimension (dim=-1)
+            elif isinstance(module, (RowParallelLinear, )):
+                self.column_sharded_weights_modules.append(name)
+
         self.model_type = type(model).__name__
 
         logger.info("Loading weights with BitsAndBytes quantization. "
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 36c85e37783ab..c376347811965 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -401,8 +401,6 @@ class FalconForCausalLM(nn.Module, SupportsPP):
         ".dense_h_to_4h.",
         ".dense_4h_to_h.",
     ]
-    # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".dense_4h_to_h.", ".dense."]
 
     def __init__(
         self,
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 57b2b43c82f89..029178af61da0 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -350,7 +350,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "gate_up_proj",
         "down_proj",
     ]
-
     # BitandBytes specific attributes
     default_bitsandbytes_target_modules = [
         ".gate_proj.",
@@ -361,8 +360,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         ".v_proj.",
         ".o_proj.",
     ]
-    # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 693f32160a289..9238ed839c9de 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -390,8 +390,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         ".v_proj.",
         ".o_proj.",
     ]
-    # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 8a9e5203972be..38a31f420cec9 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -464,8 +464,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         ".v_proj.",
         ".o_proj.",
     ]
-    # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index c1f714bb25680..f90df6b7df036 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -854,10 +854,6 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
         # resampler
         ".kv_proj.",
     ]
-    # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [
-        ".down_proj.", ".o_proj.", ".self_attn.out_proj.", ".fc2."
-    ]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),
@@ -1008,10 +1004,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
         # resampler
         ".kv_proj.",
     ]
-    # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [
-        ".down_proj.", ".o_proj.", ".self_attn.out_proj.", ".fc2."
-    ]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index a03155ac32a61..d30b9addd09f1 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1062,8 +1062,6 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
         # so we can't add a dot in front of it.
         "multi_modal_projector."
     ]
-    # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".down_proj.", ".o_proj.", ".fc2."]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 10cca8b56268a..7521ab749e10f 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -343,8 +343,6 @@ class OPTForCausalLM(nn.Module, SupportsPP):
     default_bitsandbytes_target_modules = [
         ".q_proj.", ".k_proj.", ".v_proj.", ".out_proj.", ".fc1.", ".fc2."
     ]
-    # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".out_proj.", ".fc2."]
 
     def __init__(
         self,
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 497eae4e8905b..4e7935a7636c5 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -274,8 +274,6 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     default_bitsandbytes_target_modules = [
         ".q_proj.", ".k_proj.", ".v_proj.", ".fc1.", ".fc2.", ".dense."
     ]
-    # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".fc2.", ".dense."]
 
     embedding_modules = {}
     embedding_padding_modules = []
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index db7556b3b5f4b..72b286fe6f6d6 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -395,9 +395,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         ".v_proj.",
         ".o_proj.",
     ]
-
-    # in TP, these weights are partitioned along the column dimension (dim=-1)
-    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),

From 603a661ae8ccadd8401284f7db8563164b232651 Mon Sep 17 00:00:00 2001
From: Mor Zusman <mor.zusmann@gmail.com>
Date: Mon, 4 Nov 2024 20:00:00 +0200
Subject: [PATCH 45/85] [Model] factoring out MambaMixer out of Jamba (#8993)

Signed-off-by: mzusman <mor.zusmann@gmail.com>
---
 .../layers/mamba/mamba_mixer.py               | 217 ++++++++++++++++++
 vllm/model_executor/models/jamba.py           | 199 ++--------------
 vllm/model_executor/models/mamba.py           | 203 ++--------------
 3 files changed, 245 insertions(+), 374 deletions(-)
 create mode 100644 vllm/model_executor/layers/mamba/mamba_mixer.py

diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
new file mode 100644
index 0000000000000..8ef0a6cdf2c52
--- /dev/null
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -0,0 +1,217 @@
+import torch
+from torch import nn
+from torch.nn.parameter import Parameter
+
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.distributed.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
+    causal_conv1d_fn, causal_conv1d_update)
+from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
+    selective_scan_fn, selective_state_update)
+from vllm.model_executor.models.mamba_cache import MambaCacheParams
+from vllm.model_executor.utils import set_weight_attrs
+
+
+# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
+@CustomOp.register("mamba_mixer")
+class MambaMixer(CustomOp):
+    """
+    Selective state space (Mamba) mixer: computes ∆, A, B, C and D, the
+    state space parameters, and from them the `contextualized_states`.
+    A and D are input independent (see Mamba paper [1] Section 3.5.2
+    "Interpretation of A" for why A isn't selective); ∆, B and C are
+    input dependent (the key difference between Mamba and the linear
+    time invariant S4, and why Mamba is called **selective**). When
+    `use_rms_norm` is set, ∆, B and C are RMS-normalized before the scan.
+    """
+
+    def __init__(self,
+                 hidden_size: int,
+                 ssm_state_size: int,
+                 conv_kernel_size: int,
+                 intermediate_size: int,
+                 time_step_rank: int,
+                 use_conv_bias: bool,
+                 use_bias: bool,
+                 use_rms_norm: bool,
+                 rms_norm_eps: float = 1e-5,
+                 activation="silu"):
+        super().__init__()
+        self.time_step_rank = time_step_rank
+        self.ssm_state_size = ssm_state_size
+        self.use_rms_norm = use_rms_norm
+        self.activation = activation
+
+        self.conv1d = ColumnParallelLinear(
+            input_size=conv_kernel_size,
+            output_size=intermediate_size,
+            bias=use_conv_bias,
+        )
+        # Unsqueeze to fit the conv1d weight shape into the linear weight
+        # shape. This can't be done in a `weight_loader`, since one already
+        # exists in `ColumnParallelLinear` and `set_weight_attrs` doesn't
+        # allow overriding it.
+        self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
+
+        self.in_proj = MergedColumnParallelLinear(hidden_size,
+                                                  [intermediate_size] * 2,
+                                                  bias=use_bias)
+        # selective projection used to make dt, B and C input dependent
+        self.x_proj = RowParallelLinear(
+            intermediate_size,
+            time_step_rank + ssm_state_size * 2,
+            bias=False,
+        )
+        # time step projection (discretization) -
+        # In the forward we need to apply dt_proj without the bias,
+        # as the bias is added in the selective scan kernel.
+        self.dt_proj = ColumnParallelLinear(time_step_rank,
+                                            intermediate_size,
+                                            bias=True,
+                                            skip_bias_add=True)
+
+        def weight_loader(param: Parameter, loaded_weight: torch.Tensor):
+            tp_rank = get_tensor_model_parallel_rank()
+            tp_size = get_tensor_model_parallel_world_size()
+            param.data.copy_(
+                loaded_weight.data.split(loaded_weight.shape[0] // tp_size,
+                                         dim=0)[tp_rank])
+
+        def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor):
+            weight_loader(param, -torch.exp(loaded_weight.float()))
+
+        tp_size = get_tensor_model_parallel_world_size()
+        self.A = nn.Parameter(
+            torch.empty(
+                intermediate_size // tp_size,
+                ssm_state_size,
+                dtype=torch.float32,
+            ))
+        self.D = nn.Parameter(torch.ones(intermediate_size // tp_size))
+
+        set_weight_attrs(self.D, {"weight_loader": weight_loader})
+        set_weight_attrs(self.A, {"weight_loader": A_weight_loader})
+
+        self.out_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=use_bias,
+            input_is_parallel=True,
+        )
+
+        self.dt_layernorm = RMSNorm(time_step_rank,
+                                    eps=rms_norm_eps) if use_rms_norm else None
+
+        self.b_layernorm = RMSNorm(ssm_state_size,
+                                   eps=rms_norm_eps) if use_rms_norm else None
+
+        self.c_layernorm = RMSNorm(ssm_state_size,
+                                   eps=rms_norm_eps) if use_rms_norm else None
+
+    def forward_native(self, hidden_states: torch.Tensor,
+                       attn_metadata: AttentionMetadata,
+                       conv_state: torch.Tensor, ssm_state: torch.Tensor):
+        pass  # not implemented; returns None
+
+    def forward_cuda(self, hidden_states: torch.Tensor,
+                     attn_metadata: AttentionMetadata,
+                     mamba_cache_params: MambaCacheParams):
+
+        # 1. Gated MLP's linear projection
+        projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
+        hidden_states, gate = projected_states.chunk(2, dim=-2)
+
+        # 2. Convolution sequence transformation
+        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
+                                               self.conv1d.weight.size(2))
+
+        if attn_metadata.query_start_loc is not None \
+            and attn_metadata.context_lens_tensor is not None:
+            # |---------- N-1 iteration --------|
+            # |---------------- N iteration ---------------------|
+            # |- tokenA -|......................|-- newTokens ---|
+            # |---------- context_len ----------|
+            # |-------------------- seq_len ---------------------|
+            #                                   |-- query_len ---|
+            hidden_states = causal_conv1d_fn(
+                hidden_states,
+                conv_weights,
+                self.conv1d.bias,
+                activation=self.activation,
+                conv_states=mamba_cache_params.conv_state,
+                has_initial_state=attn_metadata.context_lens_tensor > 0,
+                cache_indices=mamba_cache_params.state_indices_tensor,
+                query_start_loc=attn_metadata.query_start_loc)
+        else:
+            hidden_states = causal_conv1d_update(
+                hidden_states.transpose(0, 1),
+                mamba_cache_params.conv_state,
+                conv_weights,
+                self.conv1d.bias,
+                self.activation,
+                conv_state_indices=mamba_cache_params.state_indices_tensor)
+            hidden_states = hidden_states.transpose(0, 1)
+
+        # 3. State Space Model sequence transformation
+        # 3.a. input varying initialization of time_step, B and C
+        ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
+
+        time_step, B, C = torch.split(
+            ssm_parameters,
+            [self.time_step_rank, self.ssm_state_size, self.ssm_state_size],
+            dim=-1,
+        )
+        if self.use_rms_norm:
+            assert self.dt_layernorm is not None
+            assert self.b_layernorm is not None
+            assert self.c_layernorm is not None
+            time_step = self.dt_layernorm(time_step.contiguous())
+            B = self.b_layernorm(B.contiguous())
+            C = self.c_layernorm(C.contiguous())
+
+        discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1)
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        time_proj_bias = (self.dt_proj.bias.float() if hasattr(
+            self.dt_proj, "bias") else None)
+
+        if attn_metadata.query_start_loc is not None \
+            and attn_metadata.context_lens_tensor is not None:
+            scan_outputs = selective_scan_fn(
+                hidden_states,
+                mamba_cache_params.ssm_state,
+                discrete_time_step,
+                self.A,
+                B.transpose(-2, -1),
+                C.transpose(-2, -1),
+                self.D.float(),
+                gate,
+                time_proj_bias,
+                delta_softplus=True,
+                cache_indices=mamba_cache_params.state_indices_tensor,
+                has_initial_state=attn_metadata.context_lens_tensor > 0,
+                query_start_loc=attn_metadata.query_start_loc)
+        else:
+            scan_outputs = selective_state_update(
+                mamba_cache_params.ssm_state,
+                hidden_states.transpose(0, 1),
+                discrete_time_step.transpose(0, 1),
+                self.A,
+                B,
+                C,
+                self.D,
+                gate.transpose(0, 1),
+                time_proj_bias,
+                dt_softplus=True,
+                state_batch_indices=mamba_cache_params.state_indices_tensor)
+            scan_outputs = scan_outputs.transpose(0, 1)
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_outputs.transpose(-2,
+                                                                     -1))[0]
+        return contextualized_states
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index fddd39fb8c85b..6f7949c880e61 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -12,26 +12,19 @@
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
+from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                                ReplicatedLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
-from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
-    selective_scan_fn, selective_state_update)
+from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import (
-    composed_weight_loader, default_weight_loader, sharded_weight_loader)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import IntermediateTensors
 from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE,
                                       _get_graph_batch_size)
@@ -41,179 +34,6 @@
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
 
-# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
-class JambaMambaMixer(nn.Module):
-    """
-    Compute ∆, A, B, C, and D the state space parameters and compute
-    the `contextualized_states`. A, D are input independent
-    (see Mamba paper [1] Section 3.5.2 "Interpretation of A"
-    for why A isn't selective) ∆, B, C are input-dependent
-    (this is a key difference between Mamba and the linear time
-    invariant S4, and is why Mamba is called
-    **selective** state spaces)
-    """
-
-    def __init__(self, config: JambaConfig):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size
-        self.ssm_state_size = config.mamba_d_state
-        self.conv_kernel_size = config.mamba_d_conv
-        self.intermediate_size = config.mamba_expand * config.hidden_size
-        self.time_step_rank = config.mamba_dt_rank
-        self.use_conv_bias = config.mamba_conv_bias
-        self.use_bias = config.mamba_proj_bias
-        self.conv1d = ColumnParallelLinear(
-            input_size=self.conv_kernel_size,
-            output_size=self.intermediate_size,
-            bias=self.use_conv_bias,
-        )
-        # unsqueeze to fit conv1d weights shape into the linear weights shape.
-        # Can't do this in `weight_loader` since it already exists in
-        # `ColumnParallelLinear` and `set_weight_attrs`
-        # doesn't allow to override it
-        self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
-
-        self.in_proj = MergedColumnParallelLinear(self.hidden_size,
-                                                  [self.intermediate_size] * 2,
-                                                  bias=self.use_bias)
-        # selective projection used to make dt, B and C input dependent
-        self.x_proj = RowParallelLinear(
-            self.intermediate_size,
-            self.time_step_rank + self.ssm_state_size * 2,
-            bias=False,
-        )
-        # time step projection (discretization) -
-        # In the forward we need to apply dt_proj without the bias,
-        # as the bias is added in the selective scan kernel.
-        self.dt_proj = ColumnParallelLinear(self.time_step_rank,
-                                            self.intermediate_size,
-                                            bias=True,
-                                            skip_bias_add=True)
-
-        tp_size = get_tensor_model_parallel_world_size()
-        self.A = nn.Parameter(
-            torch.empty(
-                self.intermediate_size // tp_size,
-                self.ssm_state_size,
-                dtype=torch.float32,
-            ))
-        self.D = nn.Parameter(torch.ones(self.intermediate_size // tp_size))
-
-        set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)})
-        a_weight_loader = composed_weight_loader(
-            sharded_weight_loader(0), lambda x: -torch.exp(x.float()))
-        set_weight_attrs(self.A, {"weight_loader": a_weight_loader})
-
-        self.out_proj = RowParallelLinear(
-            self.intermediate_size,
-            self.hidden_size,
-            bias=self.use_bias,
-            input_is_parallel=True,
-        )
-        self.activation = config.hidden_act
-
-        self.dt_layernorm = RMSNorm(self.time_step_rank,
-                                    eps=config.rms_norm_eps)
-        self.b_layernorm = RMSNorm(self.ssm_state_size,
-                                   eps=config.rms_norm_eps)
-        self.c_layernorm = RMSNorm(self.ssm_state_size,
-                                   eps=config.rms_norm_eps)
-
-    def forward(self, hidden_states: torch.Tensor,
-                attn_metadata: AttentionMetadata,
-                mamba_cache_params: MambaCacheParams):
-
-        # 1. Gated MLP's linear projection
-        projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
-        hidden_states, gate = projected_states.chunk(2, dim=-2)
-
-        # 2. Convolution sequence transformation
-        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
-                                               self.conv1d.weight.size(2))
-
-        if attn_metadata.query_start_loc is not None \
-            and attn_metadata.context_lens_tensor is not None:
-            # |---------- N-1 iteration --------|
-            # |---------------- N iteration ---------------------|
-            # |- tokenA -|......................|-- newTokens ---|
-            # |---------- context_len ----------|
-            # |-------------------- seq_len ---------------------|
-            #                                   |-- query_len ---|
-            hidden_states = causal_conv1d_fn(
-                hidden_states,
-                conv_weights,
-                self.conv1d.bias,
-                activation=self.activation,
-                conv_states=mamba_cache_params.conv_state,
-                has_initial_state=attn_metadata.context_lens_tensor > 0,
-                cache_indices=mamba_cache_params.state_indices_tensor,
-                query_start_loc=attn_metadata.query_start_loc)
-        else:
-            hidden_states = causal_conv1d_update(
-                hidden_states.transpose(0, 1),
-                mamba_cache_params.conv_state,
-                conv_weights,
-                self.conv1d.bias,
-                self.activation,
-                conv_state_indices=mamba_cache_params.state_indices_tensor)
-            hidden_states = hidden_states.transpose(0, 1)
-
-        # 3. State Space Model sequence transformation
-        # 3.a. input varying initialization of time_step, B and C
-        ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
-
-        time_step, B, C = torch.split(
-            ssm_parameters,
-            [self.time_step_rank, self.ssm_state_size, self.ssm_state_size],
-            dim=-1,
-        )
-        time_step = self.dt_layernorm(time_step.contiguous())
-        B = self.b_layernorm(B.contiguous())
-        C = self.c_layernorm(C.contiguous())
-
-        discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1)
-        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
-        time_proj_bias = (self.dt_proj.bias.float() if hasattr(
-            self.dt_proj, "bias") else None)
-
-        if attn_metadata.query_start_loc is not None \
-            and attn_metadata.context_lens_tensor is not None:
-            scan_outputs = selective_scan_fn(
-                hidden_states,
-                mamba_cache_params.ssm_state,
-                discrete_time_step,
-                self.A,
-                B.transpose(-2, -1),
-                C.transpose(-2, -1),
-                self.D.float(),
-                gate,
-                time_proj_bias,
-                delta_softplus=True,
-                cache_indices=mamba_cache_params.state_indices_tensor,
-                has_initial_state=attn_metadata.context_lens_tensor > 0,
-                query_start_loc=attn_metadata.query_start_loc)
-        else:
-            scan_outputs = selective_state_update(
-                mamba_cache_params.ssm_state,
-                hidden_states.transpose(0, 1),
-                discrete_time_step.transpose(0, 1),
-                self.A,
-                B,
-                C,
-                self.D,
-                gate.transpose(0, 1),
-                time_proj_bias,
-                dt_softplus=True,
-                state_batch_indices=mamba_cache_params.state_indices_tensor)
-            scan_outputs = scan_outputs.transpose(0, 1)
-
-        # 4. Final linear projection
-        contextualized_states = self.out_proj(scan_outputs.transpose(-2,
-                                                                     -1))[0]
-        return contextualized_states
-
-
 class JambaMoE(nn.Module):
 
     def __init__(self,
@@ -284,9 +104,18 @@ def __init__(self,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
         super().__init__()
-        self.layer_idx = layer_idx
         self.config = config
-        self.mamba = JambaMambaMixer(config)
+        self.mamba = MambaMixer(hidden_size=config.hidden_size,
+                                ssm_state_size=config.mamba_d_state,
+                                conv_kernel_size=config.mamba_d_conv,
+                                intermediate_size=(config.mamba_expand *
+                                                   config.hidden_size),
+                                time_step_rank=config.mamba_dt_rank,
+                                use_conv_bias=config.mamba_conv_bias,
+                                use_bias=config.mamba_proj_bias,
+                                use_rms_norm=True,
+                                rms_norm_eps=config.rms_norm_eps,
+                                activation=config.hidden_act)
 
         num_experts = config.layers_num_experts[layer_idx]
         ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index 9f4f391a6682e..ec726dc4ff4fa 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -10,27 +10,19 @@
 from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
-from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
-    selective_scan_fn, selective_state_update)
+from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import (
-    composed_weight_loader, default_weight_loader, sharded_weight_loader)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import (HasInnerState,
                                                    IsAttentionFree)
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import IntermediateTensors
 from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE,
                                       _get_graph_batch_size)
@@ -38,194 +30,27 @@
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
 
-# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
-class MambaMixer(nn.Module):
-    """
-    Compute ∆, A, B, C, and D the state space parameters and compute
-    the `contextualized_states`. A, D are input independent
-    (see Mamba paper [1] Section 3.5.2 "Interpretation of A"
-    for why A isn't selective) ∆, B, C are input-dependent
-    (this is a key difference between Mamba and the linear time
-    invariant S4, and is why Mamba is called
-    **selective** state spaces)
-    """
-
-    def __init__(self, config: MambaConfig, layer_idx):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-        self.hidden_size = config.hidden_size
-        self.ssm_state_size = config.state_size
-        self.conv_kernel_size = config.conv_kernel
-        self.intermediate_size = config.intermediate_size
-        self.time_step_rank = int(config.time_step_rank)
-        self.is_falcon_mamba = config.model_type == "falcon_mamba"
-        self.conv1d = ColumnParallelLinear(
-            input_size=self.conv_kernel_size,
-            output_size=self.intermediate_size,
-            bias=config.use_conv_bias,
-        )
-        # unsqueeze to fit conv1d weights shape into the linear weights shape.
-        # Can't do this in `weight_loader` since it already exists in
-        # `ColumnParallelLinear` and `set_weight_attrs`
-        # doesn't allow to override it
-        self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
-
-        self.in_proj = MergedColumnParallelLinear(self.hidden_size,
-                                                  [self.intermediate_size] * 2,
-                                                  bias=config.use_bias)
-        # selective projection used to make dt, B and C input dependent
-        self.x_proj = RowParallelLinear(
-            self.intermediate_size,
-            self.time_step_rank + self.ssm_state_size * 2,
-            bias=False,
-        )
-        # time step projection (discretization) -
-        # In the forward we need to apply dt_proj without the bias,
-        # as the bias is added in the selective scan kernel.
-        self.dt_proj = ColumnParallelLinear(self.time_step_rank,
-                                            self.intermediate_size,
-                                            bias=True,
-                                            skip_bias_add=True)
-
-        tp_size = get_tensor_model_parallel_world_size()
-        self.A = nn.Parameter(
-            torch.empty(
-                self.intermediate_size // tp_size,
-                self.ssm_state_size,
-                dtype=torch.float32,
-            ))
-        self.D = nn.Parameter(torch.ones(self.intermediate_size // tp_size))
-
-        set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)})
-        a_weight_loader = composed_weight_loader(
-            sharded_weight_loader(0), lambda x: -torch.exp(x.float()))
-        set_weight_attrs(self.A, {"weight_loader": a_weight_loader})
-
-        self.out_proj = RowParallelLinear(
-            self.intermediate_size,
-            self.hidden_size,
-            bias=config.use_bias,
-            input_is_parallel=True,
-        )
-        self.activation = config.hidden_act
-        if self.is_falcon_mamba:
-            self.dt_layernorm = RMSNorm(self.time_step_rank,
-                                        eps=config.mixer_rms_eps)
-            self.b_layernorm = RMSNorm(self.ssm_state_size,
-                                       eps=config.mixer_rms_eps)
-            self.c_layernorm = RMSNorm(self.ssm_state_size,
-                                       eps=config.mixer_rms_eps)
-
-    def forward(self, hidden_states: torch.Tensor,
-                attn_metadata: AttentionMetadata,
-                mamba_cache_params: MambaCacheParams):
-
-        # 1. Gated MLP's linear projection
-        projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
-        hidden_states, gate = projected_states.chunk(2, dim=-2)
-
-        # 2. Convolution sequence transformation
-        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
-                                               self.conv1d.weight.size(2))
-
-        if attn_metadata.query_start_loc is not None \
-            and attn_metadata.context_lens_tensor is not None:
-            # |---------- N-1 iteration --------|
-            # |---------------- N iteration ---------------------|
-            # |- tokenA -|......................|-- newTokens ---|
-            # |---------- context_len ----------|
-            # |-------------------- seq_len ---------------------|
-            #                                   |-- query_len ---|
-            hidden_states = causal_conv1d_fn(
-                hidden_states,
-                conv_weights,
-                self.conv1d.bias,
-                activation=self.activation,
-                conv_states=mamba_cache_params.conv_state,
-                has_initial_state=attn_metadata.context_lens_tensor > 0,
-                cache_indices=mamba_cache_params.state_indices_tensor,
-                query_start_loc=attn_metadata.query_start_loc)
-        else:
-            hidden_states = causal_conv1d_update(
-                hidden_states.transpose(0, 1),
-                mamba_cache_params.conv_state,
-                conv_weights,
-                self.conv1d.bias,
-                self.activation,
-                conv_state_indices=mamba_cache_params.state_indices_tensor)
-            hidden_states = hidden_states.transpose(0, 1)
-
-        # 3. State Space Model sequence transformation
-        # 3.a. input varying initialization of time_step, B and C
-        ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
-
-        time_step, B, C = torch.split(
-            ssm_parameters,
-            [self.time_step_rank, self.ssm_state_size, self.ssm_state_size],
-            dim=-1,
-        )
-        # Note that Jamba and FalconMamba normalizes B, C, and time_step here
-        # but Mamba doesn't.
-        if self.is_falcon_mamba:
-            time_step = self.dt_layernorm(time_step.contiguous())
-            B = self.b_layernorm(B.contiguous())
-            C = self.c_layernorm(C.contiguous())
-
-        discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1)
-        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
-        time_proj_bias = (self.dt_proj.bias.float() if hasattr(
-            self.dt_proj, "bias") else None)
-
-        if attn_metadata.query_start_loc is not None \
-            and attn_metadata.context_lens_tensor is not None:
-            scan_outputs = selective_scan_fn(
-                hidden_states,
-                mamba_cache_params.ssm_state,
-                discrete_time_step,
-                self.A,
-                B.transpose(-2, -1),
-                C.transpose(-2, -1),
-                self.D.float(),
-                gate,
-                time_proj_bias,
-                delta_softplus=True,
-                cache_indices=mamba_cache_params.state_indices_tensor,
-                has_initial_state=attn_metadata.context_lens_tensor > 0,
-                query_start_loc=attn_metadata.query_start_loc)
-        else:
-            scan_outputs = selective_state_update(
-                mamba_cache_params.ssm_state,
-                hidden_states.transpose(0, 1),
-                discrete_time_step.transpose(0, 1),
-                self.A,
-                B,
-                C,
-                self.D,
-                gate.transpose(0, 1),
-                time_proj_bias,
-                dt_softplus=True,
-                state_batch_indices=mamba_cache_params.state_indices_tensor)
-            scan_outputs = scan_outputs.transpose(0, 1)
-
-        # 4. Final linear projection
-        contextualized_states = self.out_proj(scan_outputs.transpose(-2,
-                                                                     -1))[0]
-        return contextualized_states
-
-
 class MambaDecoderLayer(nn.Module):
 
     def __init__(self,
                  config: MambaConfig,
-                 layer_idx: int,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
         super().__init__()
-        self.layer_idx = layer_idx
         self.config = config
         self.is_falcon_mamba = config.model_type == "falcon_mamba"
-        self.mixer = MambaMixer(config, layer_idx)
+        mixer_rms_eps = config.mixer_rms_eps if self.is_falcon_mamba else None
+        self.mixer = MambaMixer(hidden_size=config.hidden_size,
+                                ssm_state_size=config.state_size,
+                                conv_kernel_size=config.conv_kernel,
+                                intermediate_size=config.intermediate_size,
+                                time_step_rank=int(config.time_step_rank),
+                                use_conv_bias=config.use_conv_bias,
+                                use_bias=config.use_bias,
+                                use_rms_norm=self.is_falcon_mamba,
+                                rms_norm_eps=mixer_rms_eps,
+                                activation=config.hidden_act)
+
         self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
     def forward(

From 1c45f4c38576db6a27a52f36af9b693807d862b7 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Mon, 4 Nov 2024 14:34:26 -0500
Subject: [PATCH 46/85] [CI] Basic Integration Test For TPU (#9968)

Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
---
 .buildkite/run-tpu-test.sh                |  2 +-
 tests/entrypoints/openai/test_accuracy.py | 17 +++++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index 6989c94d46a89..988d5aef5fb8c 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -12,4 +12,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest  && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
diff --git a/tests/entrypoints/openai/test_accuracy.py b/tests/entrypoints/openai/test_accuracy.py
index 63beaaba29a80..a16e95f94171e 100644
--- a/tests/entrypoints/openai/test_accuracy.py
+++ b/tests/entrypoints/openai/test_accuracy.py
@@ -10,6 +10,8 @@
 import lm_eval
 import pytest
 
+from vllm.platforms import current_platform
+
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
@@ -18,12 +20,21 @@
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
 EXPECTED_VALUE = 0.58
-DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
+DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"]
 MORE_ARGS_LIST = [
+    [],  # Default
     ["--enable-chunked-prefill"],  # Chunked
     ["--num-scheduler-steps", "8"],  # MS
     ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"]  # MS+Stream
 ]
+MAX_WAIT_SECONDS = None
+
+if current_platform.is_tpu():
+    MORE_ARGS_LIST = [
+        [],  # Default
+        # ["--num-scheduler-steps", "8"], # Multi-step << currently fails
+    ]
+    MAX_WAIT_SECONDS = 600
 
 
 @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
@@ -33,7 +44,9 @@ def test_lm_eval_accuracy(more_args):
 
     print(f"Running with: {args}")
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    with RemoteOpenAIServer(
+            MODEL_NAME, args,
+            max_wait_seconds=MAX_WAIT_SECONDS) as remote_server:
         url = f"{remote_server.url_for('v1')}/completions"
 
         model_args = (

From 5208dc7a203b210fa4462332a56c0012ab8b7a89 Mon Sep 17 00:00:00 2001
From: hissu-hyvarinen <hissu.hyvarinen@amd.com>
Date: Mon, 4 Nov 2024 21:37:46 +0200
Subject: [PATCH 47/85] [Bugfix][CI/Build][Hardware][AMD] Shard ID parameters
 in AMD tests running parallel jobs (#9279)

Signed-off-by: Hissu Hyvarinen <hissu.hyvarinen@amd.com>
---
 .buildkite/run-amd-test.sh  | 11 ++++++-----
 tests/lora/test_minicpmv.py |  7 ++++++-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 329cc42558da6..860272e71fd84 100755
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -107,11 +107,12 @@ fi
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
+  # use the parallel job count as the number of shards
+  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
-    #replace shard arguments
-    commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
-    commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
-    echo "Shard ${GPU} commands:$commands"
+    # assign shard-id for each shard
+    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+    echo "Shard ${GPU} commands:$commands_gpu"
     docker run \
         --device /dev/kfd --device /dev/dri \
         --network host \
@@ -123,7 +124,7 @@ if [[ $commands == *"--shard-id="* ]]; then
         -e HF_HOME=${HF_MOUNT} \
         --name ${container_name}_${GPU}  \
         ${image_name} \
-        /bin/bash -c "${commands}" \
+        /bin/bash -c "${commands_gpu}" \
         |& while read -r line; do echo ">>Shard $GPU: $line"; done &
     PIDS+=($!)
   done
diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py
index be040060d02b2..2c45ce5141f7d 100644
--- a/tests/lora/test_minicpmv.py
+++ b/tests/lora/test_minicpmv.py
@@ -1,8 +1,11 @@
 from typing import List
 
+import pytest
+
 import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
 
@@ -53,6 +56,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     return generated_texts
 
 
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
 def test_minicpmv_lora(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -63,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files):
         trust_remote_code=True,
         gpu_memory_utilization=0.97  # This model is pretty big for CI gpus
     )
-
     output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
     for i in range(len(EXPECTED_OUTPUT)):
         assert EXPECTED_OUTPUT[i].startswith(output1[i])

From 6e056bcf0414dfaee4db646f8f36ec961f0c9a33 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 4 Nov 2024 11:47:11 -0800
Subject: [PATCH 48/85] [Doc] Update VLM doc about loading from local files
 (#9999)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/vlm.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 3377502a6db28..112e9db6a41de 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -242,6 +242,10 @@ To consume the server, you can use the OpenAI client like in the example below:
 
 A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py>`_.
 
+.. tip::
+    Loading from local file paths is also supported in vLLM: specify the allowed local media path via ``--allowed-local-media-path`` when launching the API server/engine,
+    and pass the file path as ``url`` in the API request.
+
 .. tip::
     There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
     In fact, you can place image placeholders in the middle of the text by interleaving text and image content.

From 04cef2c6ab0ea47bb1dfa73d3343985499fe1c4b Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Mon, 4 Nov 2024 16:01:43 -0500
Subject: [PATCH 49/85] [Bugfix] Fix `MQLLMEngine` hanging (#9973)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 vllm/engine/multiprocessing/client.py | 12 +++++++++--
 vllm/engine/multiprocessing/engine.py | 24 +++++++++++++---------
 vllm/entrypoints/openai/api_server.py | 29 ++++++++++++++++-----------
 3 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 7f1ca621d91c4..882742c2fc61b 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -112,7 +112,11 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
 
         # Stream for each individual request.
         self.output_queues: Dict[str, asyncio.Queue] = {}
-        self.output_loop = asyncio.create_task(self.run_output_handler_loop())
+
+        # Loop to handle output of the LLMEngine periodically.
+        # Created in setup() rather than in __init__ so that we can
+        # build the Client in an executor to enable clean shutdown.
+        self.output_loop: Optional[asyncio.Task] = None
 
         # Loop to check health of the LLMEngine periodically.
         # Started after the MQLLMEngine is ready.
@@ -247,6 +251,9 @@ async def run_output_handler_loop(self):
     async def setup(self):
         """Setup the client before it starts sending server requests."""
 
+        # Start output_loop
+        self.output_loop = asyncio.create_task(self.run_output_handler_loop())
+
         with self.get_data_socket() as socket:
             # Wait until server is ready.
             response = await self._wait_for_server_rpc(socket)
@@ -265,7 +272,8 @@ def close(self):
         # Cancel background tasks.
         if self.health_loop is not None:
             self.health_loop.cancel()
-        self.output_loop.cancel()
+        if self.output_loop is not None:
+            self.output_loop.cancel()
 
     def _set_errored(self, e: BaseException):
         logger.exception(repr(e))
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index a73b4c825b11c..9dd6fa5b14315 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -349,16 +349,22 @@ def stop_profile(self) -> None:
             self.engine.model_executor._run_workers("stop_profile")
 
 
+def signal_handler(*_) -> None:
+    raise KeyboardInterrupt("MQLLMEngine terminated")
+
+
 def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
-                  ipc_path: str):
+                  ipc_path: str, engine_alive):
+    try:
+        engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
+                                              usage_context=usage_context,
+                                              ipc_path=ipc_path)
 
-    def signal_handler(*_) -> None:
-        # Interrupt server on sigterm
-        raise KeyboardInterrupt("MQLLMEngine terminated")
+        signal.signal(signal.SIGTERM, signal_handler)
 
-    signal.signal(signal.SIGTERM, signal_handler)
+        engine.start()
 
-    engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
-                                          usage_context=usage_context,
-                                          ipc_path=ipc_path)
-    engine.start()
+    except BaseException as e:
+        logger.exception(e)
+        engine_alive.value = False
+        raise e
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 95fd56d916050..bef36ffdbfcd3 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -171,39 +171,44 @@ async def build_async_engine_client_from_engine_args(
         # so we need to spawn a new process
         context = multiprocessing.get_context("spawn")
 
+        # The Process can raise an exception during startup, which may
+        # not actually result in an exitcode being reported. As a result
+        # we use a shared variable to communicate the information.
+        engine_alive = multiprocessing.Value('b', True, lock=False)
         engine_process = context.Process(target=run_mp_engine,
                                          args=(engine_args,
                                                UsageContext.OPENAI_API_SERVER,
-                                               ipc_path))
+                                               ipc_path, engine_alive))
         engine_process.start()
         engine_pid = engine_process.pid
-        assert engine_pid is not None, "Engine process failed to start"
+        assert engine_pid is not None, "Engine process failed to start."
         logger.info("Started engine process with PID %d", engine_pid)
 
         # Build RPCClient, which conforms to EngineClient Protocol.
-        # NOTE: Actually, this is not true yet. We still need to support
-        # embedding models via RPC (see TODO above)
         engine_config = engine_args.create_engine_config()
-        mp_engine_client = MQLLMEngineClient(ipc_path, engine_config,
-                                             engine_pid)
-
+        build_client = partial(MQLLMEngineClient, ipc_path, engine_config,
+                               engine_pid)
+        mq_engine_client = await asyncio.get_running_loop().run_in_executor(
+            None, build_client)
         try:
             while True:
                 try:
-                    await mp_engine_client.setup()
+                    await mq_engine_client.setup()
                     break
                 except TimeoutError:
-                    if not engine_process.is_alive():
+                    if (not engine_process.is_alive()
+                            or not engine_alive.value):
                         raise RuntimeError(
-                            "Engine process failed to start") from None
+                            "Engine process failed to start. See stack "
+                            "trace for the root cause.") from None
 
-            yield mp_engine_client  # type: ignore[misc]
+            yield mq_engine_client  # type: ignore[misc]
         finally:
             # Ensure rpc server process was terminated
             engine_process.terminate()
 
             # Close all open connections to the backend
-            mp_engine_client.close()
+            mq_engine_client.close()
 
             # Wait for engine process to join
             engine_process.join(4)

From 9a5664d4a4d212a6ebad79b15b11eb8d3ab2a0b2 Mon Sep 17 00:00:00 2001
From: lkchen <github@lkchen.net>
Date: Mon, 4 Nov 2024 14:32:16 -0800
Subject: [PATCH 50/85] [Misc] Refactor benchmark_throughput.py (#9779)

Signed-off-by: Linkun Chen <github+anyscale@lkchen.net>
Co-authored-by: Linkun Chen <lkchen@github.com>
Co-authored-by: Linkun Chen <github+anyscale@lkchen.net>
---
 benchmarks/benchmark_throughput.py | 81 ++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 26 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index ee41c8ea38382..262b8652e49ff 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -4,7 +4,7 @@
 import json
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import List, Optional
 
 import torch
 import uvloop
@@ -15,16 +15,35 @@
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
+from vllm.inputs import TextPrompt
+from vllm.multimodal import MultiModalDataDict
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
 
 
+@dataclasses.dataclass
+class SampleRequest:
+    """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+            images).
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+    """
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    multi_modal_data: Optional[MultiModalDataDict] = None
+
+
 def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+) -> List[SampleRequest]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
 
@@ -41,7 +60,7 @@ def sample_requests(
     random.shuffle(dataset)
 
     # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
+    filtered_dataset: List[SampleRequest] = []
     for i in range(len(dataset)):
         if len(filtered_dataset) == num_requests:
             break
@@ -60,13 +79,16 @@ def sample_requests(
         if prompt_len > 1024 or prompt_len + output_len > 2048:
             # Prune too long sequences.
             continue
-        filtered_dataset.append((prompt, prompt_len, output_len))
+        filtered_dataset.append(
+            SampleRequest(prompt=prompt,
+                          prompt_len=prompt_len,
+                          expected_output_len=output_len))
 
     return filtered_dataset
 
 
 def run_vllm(
-    requests: List[Tuple[str, int, int]],
+    requests: List[SampleRequest],
     n: int,
     engine_args: EngineArgs,
 ) -> float:
@@ -74,17 +96,17 @@ def run_vllm(
     llm = LLM(**dataclasses.asdict(engine_args))
 
     # Add the requests to the engine.
-    prompts: List[str] = []
+    prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
-    for prompt, _, output_len in requests:
-        prompts.append(prompt)
+    for request in requests:
+        prompts.append(TextPrompt(prompt=request.prompt))
         sampling_params.append(
             SamplingParams(
                 n=n,
                 temperature=1.0,
                 top_p=1.0,
                 ignore_eos=True,
-                max_tokens=output_len,
+                max_tokens=request.expected_output_len,
             ))
 
     use_beam_search = False
@@ -94,11 +116,11 @@ def run_vllm(
         llm.generate(prompts, sampling_params, use_tqdm=True)
         end = time.perf_counter()
     else:
-        prompts = [prompt for prompt, _, _ in requests]
+        prompts = [request.prompt for request in requests]
         # output_len should be the same for all requests.
         output_len = requests[0][2]
-        for prompt, input_len, _output_len in requests:
-            assert _output_len == output_len
+        for request in requests:
+            assert request.expected_output_len == output_len
         start = time.perf_counter()
         llm.beam_search(
             prompts,
@@ -112,7 +134,7 @@ def run_vllm(
 
 
 async def run_vllm_async(
-    requests: List[Tuple[str, int, int]],
+    requests: List[SampleRequest],
     n: int,
     engine_args: AsyncEngineArgs,
     disable_frontend_multiprocessing: bool = False,
@@ -123,17 +145,17 @@ async def run_vllm_async(
             engine_args, disable_frontend_multiprocessing) as llm:
 
         # Add the requests to the engine.
-        prompts: List[str] = []
+        prompts: List[TextPrompt] = []
         sampling_params: List[SamplingParams] = []
-        for prompt, _, output_len in requests:
-            prompts.append(prompt)
+        for request in requests:
+            prompts.append(TextPrompt(prompt=request.prompt))
             sampling_params.append(
                 SamplingParams(
                     n=n,
                     temperature=1.0,
                     top_p=1.0,
                     ignore_eos=True,
-                    max_tokens=output_len,
+                    max_tokens=request.expected_output_len,
                 ))
 
         generators = []
@@ -149,7 +171,7 @@ async def run_vllm_async(
 
 
 def run_hf(
-    requests: List[Tuple[str, int, int]],
+    requests: List[SampleRequest],
     model: str,
     tokenizer: PreTrainedTokenizerBase,
     n: int,
@@ -207,14 +229,14 @@ def run_hf(
 
 
 def run_mii(
-    requests: List[Tuple[str, int, int]],
+    requests: List[SampleRequest],
     model: str,
     tensor_parallel_size: int,
     output_len: int,
 ) -> float:
     from mii import client, serve
     llm = serve(model, tensor_parallel=tensor_parallel_size)
-    prompts = [prompt for prompt, _, _ in requests]
+    prompts = [request.prompt for request in requests]
 
     start = time.perf_counter()
     llm.generate(prompts, max_new_tokens=output_len)
@@ -243,8 +265,12 @@ def main(args: argparse.Namespace):
         else:
             raise ValueError(
                 f"Failed to synthesize a prompt with {args.input_len} tokens.")
-        requests = [(prompt, args.input_len, args.output_len)
-                    for _ in range(args.num_prompts)]
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=args.input_len,
+                          expected_output_len=args.output_len)
+            for _ in range(args.num_prompts)
+        ]
     else:
         requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
                                    args.output_len)
@@ -270,9 +296,10 @@ def main(args: argparse.Namespace):
                                args.output_len)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
-    total_num_tokens = sum(prompt_len + output_len
-                           for _, prompt_len, output_len in requests)
-    total_output_tokens = sum(output_len for _, _, output_len in requests)
+    total_num_tokens = sum(request.prompt_len + request.expected_output_len
+                           for request in requests)
+    total_output_tokens = sum(request.expected_output_len
+                              for request in requests)
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
           f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
@@ -299,7 +326,9 @@ def main(args: argparse.Namespace):
     parser.add_argument("--dataset",
                         type=str,
                         default=None,
-                        help="Path to the dataset.")
+                        help="Path to the dataset. The dataset is expected to "
+                        "be a json in form of List[Dict[..., conversations: "
+                        "List[Dict[..., value: <prompt_or_response>]]]]")
     parser.add_argument("--input-len",
                         type=int,
                         default=None,

From ac04a97a9fbc122bb14ff4eb590314d453cdf57c Mon Sep 17 00:00:00 2001
From: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Date: Tue, 5 Nov 2024 00:53:24 +0200
Subject: [PATCH 51/85] [Frontend] Add max_tokens prometheus metric (#9881)

Signed-off-by: Tomer Asida <tomera@ai21.com>
---
 tests/entrypoints/openai/test_metrics.py | 11 +++++++++--
 tests/metrics/test_metrics.py            |  1 +
 vllm/engine/llm_engine.py                |  4 ++++
 vllm/engine/metrics.py                   |  8 ++++++++
 vllm/engine/metrics_types.py             |  1 +
 5 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index b3f1fea91d13e..6523c8b6297c6 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -70,10 +70,14 @@ async def client(server):
     [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
      ("_count", _NUM_REQUESTS)],
     "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
+    "vllm:request_params_max_tokens":
+    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+     ("_count", _NUM_REQUESTS)],
     "vllm:prompt_tokens": [("_total",
                             _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
-    "vllm:generation_tokens":
-    [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
+    "vllm:generation_tokens": [
+        ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
+    ],
     "vllm:request_success": [("_total", _NUM_REQUESTS)],
 }
 
@@ -149,6 +153,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:request_params_n_sum",
     "vllm:request_params_n_bucket",
     "vllm:request_params_n_count",
+    "vllm:request_params_max_tokens_sum",
+    "vllm:request_params_max_tokens_bucket",
+    "vllm:request_params_max_tokens_count",
     "vllm:num_preemptions_total",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index 7a361ef320810..4a824c7acef21 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -365,6 +365,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
             "vllm:request_prompt_tokens",
             "vllm:request_generation_tokens",
             "vllm:request_params_n",
+            "vllm:request_params_max_tokens",
         ]
         for metric_name in request_histogram_metrics:
             metric_value = REGISTRY.get_sample_value(f"{metric_name}_count",
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index b12d29c4a8503..2c584218485c8 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1685,6 +1685,7 @@ def _get_stats(self,
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
         n_requests: List[int] = []
+        max_tokens_requests: List[int] = []
         finished_reason_requests: List[str] = []
 
         # Lora requests
@@ -1792,6 +1793,8 @@ def _get_stats(self,
                     ])
                     if seq_group.sampling_params is not None:
                         n_requests.append(seq_group.sampling_params.n)
+                        max_tokens_requests.append(
+                            seq_group.sampling_params.max_tokens)
                     finished_reason_requests.extend([
                         SequenceStatus.get_finished_reason(seq.status)
                         for seq in seq_group.get_finished_seqs()
@@ -1847,6 +1850,7 @@ def _get_stats(self,
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,
             n_requests=n_requests,
+            max_tokens_requests=max_tokens_requests,
             finished_reason_requests=finished_reason_requests,
             max_lora=str(max_lora_stat),
             waiting_lora_adapters=list(waiting_lora_adapters.keys()),
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 9ed30e1e99857..3e3357ed74633 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -179,6 +179,12 @@ def __init__(self, labelnames: List[str], max_model_len: int):
             labelnames=labelnames,
             buckets=[1, 2, 5, 10, 20],
         )
+        self.histogram_max_tokens_request = self._histogram_cls(
+            name="vllm:request_params_max_tokens",
+            documentation="Histogram of the max_tokens request parameter.",
+            labelnames=labelnames,
+            buckets=build_1_2_5_buckets(max_model_len),
+        )
         self.counter_request_success = self._counter_cls(
             name="vllm:request_success_total",
             documentation="Count of successfully processed requests.",
@@ -547,6 +553,8 @@ def _log_prometheus(self, stats: Stats) -> None:
             self.metrics.histogram_num_generation_tokens_request,
             stats.num_generation_tokens_requests)
         self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
+        self._log_histogram(self.metrics.histogram_max_tokens_request,
+                            stats.max_tokens_requests)
 
     def _log_prometheus_interval(self, prompt_throughput: float,
                                  generation_throughput: float) -> None:
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 510dd04bb3e55..25b7a7479672a 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -53,6 +53,7 @@ class Stats:
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]
     n_requests: List[int]
+    max_tokens_requests: List[int]
     finished_reason_requests: List[str]
     waiting_lora_adapters: List[str]
     running_lora_adapters: List[str]

From d93478b399535d4b31e49d584d323172e6060653 Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Mon, 4 Nov 2024 18:11:28 -0500
Subject: [PATCH 52/85] [Bugfix] Upgrade to pytorch 2.5.1 (#10001)

Signed-off-by: Bill Nell <bill@neuralmagic.com>
---
 CMakeLists.txt            | 4 ++--
 pyproject.toml            | 2 +-
 requirements-build.txt    | 2 +-
 requirements-cuda.txt     | 6 +++---
 requirements-openvino.txt | 2 +-
 requirements-test.in      | 2 +-
 requirements-test.txt     | 4 ++--
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a6a311e97633..943424bc4edfa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
 
 #
 # Try to find python package with an executable that exactly matches
diff --git a/pyproject.toml b/pyproject.toml
index e78f5652f486b..0bbab3cd3fbc3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "packaging",
     "setuptools>=61",
     "setuptools-scm>=8.0",
-    "torch == 2.5.0",
+    "torch == 2.5.1",
     "wheel",
     "jinja2",
 ]
diff --git a/requirements-build.txt b/requirements-build.txt
index 7b16d9778c1a6..fec01caaf25ef 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.5.0
+torch==2.5.1
 wheel
 jinja2
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 282ab11838bf4..058ab7c1ee9df 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,7 +4,7 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py >= 12.560.30 # for pynvml package
-torch == 2.5.0
+torch == 2.5.1
 # These must be updated alongside torch
-torchvision == 0.20   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.28.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.0
+torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.1
diff --git a/requirements-openvino.txt b/requirements-openvino.txt
index 7ad0d1e7f704b..95e5914757812 100644
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
@@ -1,7 +1,7 @@
 # Common dependencies
 -r requirements-common.txt
 
-torch == 2.5.0 #  should be aligned with "common" vLLM torch version
+torch == 2.5.1 #  should be aligned with "common" vLLM torch version
 openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
 
 optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
diff --git a/requirements-test.in b/requirements-test.in
index 5d44664c082a6..560c005fd6157 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -18,7 +18,7 @@ ray[adag]==2.35
 sentence-transformers # required for embedding
 soundfile # required for audio test
 timm # required for internvl test
-torch==2.5.0
+torch==2.5.1
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 datamodel_code_generator # required for minicpm3 test
diff --git a/requirements-test.txt b/requirements-test.txt
index 7477b7c3a79cd..518e81021cbcb 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -492,7 +492,7 @@ timm==1.0.11
     # via -r requirements-test.in
 tokenizers==0.20.1
     # via transformers
-torch==2.5.0
+torch==2.5.1
     # via
     #   -r requirements-test.in
     #   accelerate
@@ -503,7 +503,7 @@ torch==2.5.0
     #   tensorizer
     #   timm
     #   torchvision
-torchvision==0.20.0
+torchvision==0.20.1
     # via timm
 tqdm==4.66.6
     # via

From 2094062b4eafe465826e936fbd5cbd8f099d7762 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 4 Nov 2024 15:11:59 -0800
Subject: [PATCH 53/85] [4.5/N] bugfix for quant config in speculative decode
 (#10007)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/spec_decode/spec_decode_worker.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index a402181b13db8..eb3c2e88e668c 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -61,6 +61,10 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
 
     draft_worker_config = copy.deepcopy(vllm_config)
     draft_worker_config.model_config = speculative_config.draft_model_config
+    draft_worker_config.quant_config = VllmConfig._get_quantization_config(
+        draft_worker_config.model_config,
+        vllm_config.load_config,
+    )
     draft_worker_config.parallel_config = speculative_config.draft_parallel_config  # noqa
     # TODO allow draft-model specific load config.
 

From 8f0a9ca890a125f2b0fef49ba042ecf5b37830a8 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Mon, 4 Nov 2024 18:57:44 -0500
Subject: [PATCH 54/85] [Bugfix] Respect modules_to_not_convert within
 awq_marlin (#9895)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 .../layers/quantization/awq_marlin.py         | 35 +++++++++++++------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 95ec12daeeeb5..ea69bee45f8d9 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -9,7 +9,9 @@
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                               UnquantizedLinearMethod,
                                                set_weight_attrs)
+from vllm.model_executor.layers.quantization.awq import is_layer_skipped_awq
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
@@ -36,13 +38,18 @@ class AWQMarlinConfig(QuantizationConfig):
         8: scalar_types.uint8,
     }
 
-    def __init__(self, weight_bits: int, group_size: int, has_zp: bool,
-                 lm_head_quantized: bool) -> None:
+    def __init__(self,
+                 weight_bits: int,
+                 group_size: int,
+                 zero_point: bool,
+                 lm_head_quantized: bool,
+                 modules_to_not_convert: Optional[List[str]] = None) -> None:
         self.pack_factor = 32 // weight_bits  # packed into int32
         self.group_size = group_size
-        self.has_zp = has_zp
+        self.zero_point = zero_point
         self.lm_head_quantized = lm_head_quantized
         self.weight_bits = weight_bits
+        self.modules_to_not_convert = modules_to_not_convert or []
 
         if self.weight_bits not in self.TYPE_MAP:
             raise ValueError(f"Unsupported num_bits = {self.weight_bits}. "
@@ -52,13 +59,14 @@ def __init__(self, weight_bits: int, group_size: int, has_zp: bool,
 
         verify_marlin_supported(self.quant_type,
                                 group_size=self.group_size,
-                                has_zp=self.has_zp)
+                                has_zp=self.zero_point)
 
     def __repr__(self) -> str:
         return (f"AWQMarlinConfig(quant_type={self.quant_type}, "
                 f"group_size={self.group_size}, "
-                f"has_zp={self.has_zp}, "
-                f"lm_head_quantized={self.lm_head_quantized})")
+                f"zero_point={self.zero_point}, "
+                f"lm_head_quantized={self.lm_head_quantized}, "
+                f"modules_to_not_convert={self.modules_to_not_convert})")
 
     @classmethod
     def get_name(cls) -> str:
@@ -80,10 +88,13 @@ def get_config_filenames(cls) -> List[str]:
     def from_config(cls, config: Dict[str, Any]) -> "AWQMarlinConfig":
         weight_bits = cls.get_from_keys(config, ["bits"])
         group_size = cls.get_from_keys(config, ["group_size"])
-        has_zp = cls.get_from_keys(config, ["zero_point"])
+        zero_point = cls.get_from_keys(config, ["zero_point"])
         lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
                                                  default=False)
-        return cls(weight_bits, group_size, has_zp, lm_head_quantized)
+        modules_to_not_convert = cls.get_from_keys_or(
+            config, ["modules_to_not_convert"], None)
+        return cls(weight_bits, group_size, zero_point, lm_head_quantized,
+                   modules_to_not_convert)
 
     @classmethod
     def override_quantization_method(cls, hf_quant_cfg,
@@ -109,6 +120,8 @@ def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         if (isinstance(layer, LinearBase) or
             (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
+            if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+                return UnquantizedLinearMethod()
             return AWQMarlinLinearMethod(self)
         elif isinstance(layer, FusedMoE):
             return AWQMoEMethod(self)
@@ -123,7 +136,7 @@ def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]):
         quant_method = quant_config.get("quant_method", "").lower()
         num_bits = quant_config.get("bits")
         group_size = quant_config.get("group_size")
-        has_zp = quant_config.get("zero_point")
+        zero_point = quant_config.get("zero_point")
 
         if not current_platform.is_cuda():
             return False
@@ -132,7 +145,7 @@ def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]):
             return False
 
         # If we cannot find the info needed in the config, cannot convert.
-        if (num_bits is None or group_size is None or has_zp is None):
+        if (num_bits is None or group_size is None or zero_point is None):
             return False
 
         if num_bits not in cls.TYPE_MAP:
@@ -140,7 +153,7 @@ def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]):
 
         return check_marlin_supported(quant_type=cls.TYPE_MAP[num_bits],
                                       group_size=group_size,
-                                      has_zp=has_zp)
+                                      has_zp=zero_point)
 
 
 class AWQMarlinLinearMethod(LinearMethodBase):

From 04bbf38e05fe75539577184f6ca776df39e70dcd Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Mon, 4 Nov 2024 20:08:21 -0500
Subject: [PATCH 55/85] [Core] Use os.sched_yield in ShmRingBuffer instead of
 time.sleep (#9994)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 .../device_communicators/shm_broadcast.py         | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 7d526b25ed193..2ff1a1ead99c1 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -1,3 +1,4 @@
+import os
 import pickle
 import time
 from contextlib import contextmanager
@@ -18,12 +19,6 @@
 
 VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL
 
-# time to wait if the queue is full or empty
-# if we sleep for too short, it will consume too much CPU
-# if we sleep for too long, it will slow down the writer/reader
-# 0.1 us is a good balance
-RINGBUFFER_SLEEP_INTERVAL = 1e-7
-
 logger = init_logger(__name__)
 
 
@@ -333,8 +328,8 @@ def acquire_write(self):
                     # if this block is not ready to write,
                     # we need to wait until it is read by all readers
 
-                    # wait for a while
-                    time.sleep(RINGBUFFER_SLEEP_INTERVAL)
+                    # Release the processor to other threads
+                    os.sched_yield()
 
                     # if we wait for a long time, we should warn the user
                     if (time.monotonic() - start_time >
@@ -387,8 +382,8 @@ def acquire_read(self):
                     # if this block is not ready,
                     # we need to wait until it is written
 
-                    # wait for a while
-                    time.sleep(RINGBUFFER_SLEEP_INTERVAL)
+                    # Release the processor to other threads
+                    os.sched_yield()
 
                     # if we wait for a long time, we should warn the user
                     if (time.monotonic() - start_time >

From bbc3619dc806b25fd5e14eef90819052ab76e1c6 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 5 Nov 2024 10:07:31 +0800
Subject: [PATCH 56/85] [Core] Make encoder-decoder inputs a nested structure
 to be more composable (#9604)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/core/utils.py                           |  57 ++--
 .../output_processor/test_stop_checker.py     |   3 +-
 tests/test_cache_block_hashing.py             |   7 +-
 tests/tokenization/test_detokenize.py         |   6 +-
 vllm/engine/llm_engine.py                     |  51 ++--
 vllm/engine/protocol.py                       |  23 +-
 vllm/inputs/__init__.py                       |  11 +-
 vllm/inputs/data.py                           |  51 ++--
 vllm/inputs/parse.py                          |  15 +-
 vllm/inputs/preprocess.py                     | 269 +++++++++---------
 vllm/inputs/registry.py                       |  14 +-
 vllm/model_executor/models/mllama.py          |  96 +++++--
 vllm/model_executor/models/registry.py        |   5 +
 vllm/sequence.py                              | 113 +++-----
 14 files changed, 372 insertions(+), 349 deletions(-)

diff --git a/tests/core/utils.py b/tests/core/utils.py
index a95a573db7cd3..cd0caa4704e11 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -4,6 +4,7 @@
 from typing import Tuple
 
 from vllm import SamplingParams
+from vllm.inputs import EncoderDecoderInputs, token_inputs
 from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob, Sequence, SequenceGroup
 
@@ -27,10 +28,7 @@ def create_dummy_prompt(
         prompt_tokens = list(range(prompt_length))
     prompt_str = " ".join([str(t) for t in prompt_tokens])
     prompt = Sequence(int(request_id),
-                      inputs={
-                          "prompt": prompt_str,
-                          "prompt_token_ids": prompt_tokens,
-                      },
+                      inputs=token_inputs(prompt_tokens, prompt=prompt_str),
                       block_size=block_size)
     seq_group = SequenceGroup(request_id=request_id,
                               seqs=[prompt],
@@ -63,23 +61,21 @@ def create_dummy_prompt_encoder_decoder(
     encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length))))
     encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens])
 
-    inputs = {
-        "prompt": decoder_prompt_str,
-        "prompt_token_ids": decoder_prompt_tokens,
-        "encoder_prompt": encoder_prompt_str,
-        "encoder_prompt_token_ids": encoder_prompt_tokens,
-        "multi_modal_data": None,
+    inputs: EncoderDecoderInputs = {
+        "decoder": token_inputs(decoder_prompt_tokens,
+                                prompt=decoder_prompt_str),
+        "encoder": token_inputs(encoder_prompt_tokens,
+                                prompt=encoder_prompt_str),
     }
 
     decoder_prompt = Sequence(int(request_id),
-                              inputs=inputs,
-                              block_size=block_size,
-                              from_decoder_prompt=True)
+                              inputs=inputs["decoder"],
+                              block_size=block_size)
 
     encoder_prompt = Sequence(int(request_id),
-                              inputs=inputs,
-                              block_size=block_size,
-                              from_decoder_prompt=False)
+                              inputs=inputs["encoder"],
+                              block_size=block_size)
+
     seq_group = SequenceGroup(request_id=request_id,
                               seqs=[decoder_prompt],
                               sampling_params=SamplingParams(best_of=best_of),
@@ -108,7 +104,7 @@ def create_seq_group(
     for seq_id_offset, output_len in enumerate(seq_output_lens):
         seq = Sequence(
             seq_id=seq_id_start + seq_id_offset,
-            inputs={"prompt_token_ids": prompt_token_ids},
+            inputs=token_inputs(prompt_token_ids),
             block_size=16,
         )
 
@@ -143,21 +139,19 @@ def create_seq_group_encoder_decoder(
 
     prompt_token_ids = [0] * seq_prompt_len
 
-    inputs = {
-        "prompt": "",
-        "prompt_token_ids": prompt_token_ids,
-        "encoder_prompt": "",
-        "encoder_prompt_token_ids": prompt_token_ids,
-        "multi_modal_data": None,
+    inputs: EncoderDecoderInputs = {
+        "decoder": token_inputs(prompt_token_ids),
+        "encoder": token_inputs(prompt_token_ids),
     }
 
     seqs = []
     for seq_id_offset, output_len in enumerate(seq_output_lens):
         # Construct decoder input sequences
-        seq = Sequence(seq_id=seq_id_start + seq_id_offset,
-                       inputs=inputs,
-                       block_size=16,
-                       from_decoder_prompt=True)
+        seq = Sequence(
+            seq_id=seq_id_start + seq_id_offset,
+            inputs=inputs["decoder"],
+            block_size=16,
+        )
 
         for i in range(output_len):
             seq.append_token_id(
@@ -167,10 +161,11 @@ def create_seq_group_encoder_decoder(
         seqs.append(seq)
 
     # Encoder input sequence
-    encoder_seq = Sequence(seq_id=seq_id_start + len(seq_output_lens),
-                           inputs=inputs,
-                           block_size=16,
-                           from_decoder_prompt=False)
+    encoder_seq = Sequence(
+        seq_id=seq_id_start + len(seq_output_lens),
+        inputs=inputs["encoder"],
+        block_size=16,
+    )
 
     return SequenceGroup(request_id=request_id,
                          seqs=seqs,
diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py
index 0d84443c51f99..cc14e8cbf75df 100644
--- a/tests/engine/output_processor/test_stop_checker.py
+++ b/tests/engine/output_processor/test_stop_checker.py
@@ -4,6 +4,7 @@
 from transformers import PreTrainedTokenizer
 
 from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.inputs import token_inputs
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import Logprob, Sequence, SequenceStatus
 
@@ -15,7 +16,7 @@ def sequence_with_eos(text: str, eos_token: str,
     """
     seq = Sequence(
         seq_id=0,
-        inputs={"prompt_token_ids": []},
+        inputs=token_inputs([]),
         block_size=16,
         eos_token_id=eos_token_id,
     )
diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py
index 3576a4834ebc3..e8f8499aa88ca 100644
--- a/tests/test_cache_block_hashing.py
+++ b/tests/test_cache_block_hashing.py
@@ -6,6 +6,7 @@
 
 import pytest
 
+from vllm.inputs import token_inputs
 from vllm.lora.request import LoRARequest
 from vllm.sequence import Sequence
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
@@ -70,10 +71,8 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
                 hashes[-1].append([])
                 prompt_token_ids = tokenizer.encode(prompt)
                 seq = Sequence(seq_id,
-                               inputs={
-                                   "prompt": prompt,
-                                   "prompt_token_ids": prompt_token_ids,
-                               },
+                               inputs=token_inputs(prompt_token_ids,
+                                                   prompt=prompt),
                                block_size=block_size,
                                eos_token_id=tokenizer.tokenizer.eos_token_id,
                                lora_request=lora_request)
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 1d07885349409..a3e70a40db979 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -3,6 +3,7 @@
 import pytest
 from transformers import AutoTokenizer
 
+from vllm.inputs import token_inputs
 from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
 from vllm.transformers_utils.detokenizer import (Detokenizer,
                                                  detokenize_incrementally)
@@ -169,10 +170,7 @@ def create_sequence(prompt_token_ids=None):
     prompt_token_ids = prompt_token_ids or [1]
     return Sequence(
         seq_id=0,
-        inputs={
-            "prompt": "<s>",
-            "prompt_token_ids": prompt_token_ids,
-        },
+        inputs=token_inputs(prompt_token_ids, prompt="<s>"),
         block_size=16,
     )
 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 2c584218485c8..a1809b1a9dd26 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -10,7 +10,7 @@
 from typing import Set, Type, Union, cast, overload
 
 import torch
-from typing_extensions import TypeIs, TypeVar
+from typing_extensions import TypeVar
 
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
@@ -29,9 +29,9 @@
 from vllm.executor.executor_base import ExecutorBase
 from vllm.executor.gpu_executor import GPUExecutor
 from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs,
-                         EncoderDecoderInputs, InputRegistry, PromptType,
-                         TokensPrompt)
+from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
+                         PromptType)
+from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.logits_process import get_bad_words_logits_processors
@@ -638,7 +638,7 @@ def _verify_args(self) -> None:
     def _add_processed_request(
         self,
         request_id: str,
-        processed_inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs],
+        processed_inputs: ProcessorInputs,
         params: Union[SamplingParams, PoolingParams],
         arrival_time: float,
         lora_request: Optional[LoRARequest],
@@ -669,18 +669,19 @@ def _add_processed_request(
         seq_id = next(self.seq_counter)
         eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
 
-        seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id,
+        if is_encoder_decoder_inputs(processed_inputs):
+            decoder_inputs = processed_inputs["decoder"]
+            encoder_inputs = processed_inputs["encoder"]
+        else:
+            decoder_inputs = processed_inputs
+            encoder_inputs = None
+
+        seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
                        lora_request, prompt_adapter_request)
 
-        encoder_seq = None
-        if 'encoder_prompt_token_ids' in processed_inputs:
-            encoder_seq = Sequence(seq_id,
-                                   processed_inputs,
-                                   block_size,
-                                   eos_token_id,
-                                   lora_request,
-                                   prompt_adapter_request,
-                                   from_decoder_prompt=False)
+        encoder_seq = (None if encoder_inputs is None else Sequence(
+            seq_id, encoder_inputs, block_size, eos_token_id, lora_request,
+            prompt_adapter_request))
 
         # Create a SequenceGroup based on SamplingParams or PoolingParams
         if isinstance(params, SamplingParams):
@@ -874,7 +875,7 @@ def _validate_token_prompt(self, prompt: PromptType,
         # This needs to happen before multimodal input pre-processing, which
         # may add dummy <image> tokens that aren't part of the tokenizer's
         # vocabulary.
-        if self._is_token_prompt(prompt):
+        if is_token_prompt(prompt):
             prompt_ids = prompt["prompt_token_ids"]
             if len(prompt_ids) == 0:
                 # Empty prompt check is handled later
@@ -884,10 +885,6 @@ def _validate_token_prompt(self, prompt: PromptType,
                 raise ValueError(
                     "Token id {} is out of vocabulary".format(max_input_id))
 
-    @staticmethod
-    def _is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]:
-        return isinstance(prompt, dict) and "prompt_token_ids" in prompt
-
     def _create_sequence_group_with_sampling(
         self,
         request_id: str,
@@ -1978,17 +1975,17 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None:
     def is_encoder_decoder_model(self):
         return self.input_preprocessor.is_encoder_decoder_model()
 
-    def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs,
-                                                   EncoderDecoderInputs],
+    def _validate_model_inputs(self, inputs: ProcessorInputs,
                                lora_request: Optional[LoRARequest]):
-        if self.model_config.is_multimodal_model:
+        if is_encoder_decoder_inputs(inputs):
             # For encoder-decoder multimodal models, the max_prompt_len
             # restricts the decoder prompt length
-            prompt_ids = inputs.get("prompt_token_ids")
-        elif self.is_encoder_decoder_model():
-            prompt_ids = inputs.get("encoder_prompt_token_ids")
+            prompt_inputs = inputs["decoder" if self.model_config.
+                                   is_multimodal_model else "encoder"]
         else:
-            prompt_ids = inputs.get("prompt_token_ids")
+            prompt_inputs = inputs
+
+        prompt_ids = prompt_inputs.get("prompt_token_ids")
 
         if prompt_ids is None or len(prompt_ids) == 0:
             raise ValueError("Prompt cannot be empty")
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 6a09361c56865..e0b59d94cfdc3 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -1,11 +1,12 @@
 import asyncio
 from abc import ABC, abstractmethod
-from typing import AsyncGenerator, List, Mapping, Optional, Union
+from typing import AsyncGenerator, List, Mapping, Optional
 
 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
 from vllm.config import DecodingConfig, ModelConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.inputs.data import PromptType, TokensPrompt
+from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -60,7 +61,7 @@ def generate(
 
     async def beam_search(
         self,
-        prompt: Union[PromptType, List[int]],
+        prompt: PromptType,
         model_config: ModelConfig,
         request_id: str,
         params: BeamSearchParams,
@@ -76,11 +77,19 @@ async def beam_search(
         tokenizer = await self.get_tokenizer()
         input_preprocessor = InputPreprocessor(model_config, tokenizer)
 
-        (prompt_text, prompt_token_ids, multi_modal_data,
-         mm_processor_kwargs) = input_preprocessor._extract_prompt_components(
-             prompt,
-             request_id=request_id,
-         )
+        if is_explicit_encoder_decoder_prompt(prompt):
+            raise NotImplementedError
+        else:
+            processed_inputs = input_preprocessor._prompt_to_llm_inputs(
+                prompt,
+                request_id=request_id,
+            )
+
+        prompt_token_ids = processed_inputs["prompt_token_ids"]
+        prompt_text = processed_inputs.get("prompt")
+        multi_modal_data = processed_inputs.get("multi_modal_data")
+        mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs")
+
         tokenized_length = len(prompt_token_ids)
 
         sort_beams_key = create_sort_beams_key_function(
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index ac7b3ca28b406..68ac50a2c5a16 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -1,8 +1,8 @@
 from .data import (DecoderOnlyInputs, EncoderDecoderInputs,
-                   ExplicitEncoderDecoderPrompt, PromptType, SingletonInputs,
-                   SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt,
-                   build_explicit_enc_dec_prompt, to_enc_dec_tuple_list,
-                   token_inputs, zip_enc_dec_prompts)
+                   ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType,
+                   SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs,
+                   TokensPrompt, build_explicit_enc_dec_prompt,
+                   to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts)
 from .registry import DummyData, InputContext, InputRegistry
 
 INPUT_REGISTRY = InputRegistry()
@@ -22,9 +22,10 @@
     "ExplicitEncoderDecoderPrompt",
     "TokenInputs",
     "token_inputs",
-    "SingletonInputs",
     "DecoderOnlyInputs",
     "EncoderDecoderInputs",
+    "ProcessorInputs",
+    "SingletonInputs",
     "build_explicit_enc_dec_prompt",
     "to_enc_dec_tuple_list",
     "zip_enc_dec_prompts",
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index ba393cbcce4eb..46b41f431bec7 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -1,4 +1,4 @@
-from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List,
+from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal,
                     Optional, Tuple, Union, cast)
 
 from typing_extensions import NotRequired, TypedDict, TypeVar
@@ -122,27 +122,30 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
 
 class TokenInputs(TypedDict):
     """Represents token-based inputs."""
+
+    type: Literal["token"]
+    """The type of inputs."""
+
     prompt_token_ids: List[int]
     """The token IDs of the prompt."""
 
-    prompt: NotRequired[Optional[str]]
+    prompt: NotRequired[str]
     """
     The original prompt text corresponding to the token IDs, if available.
     """
 
-    multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
+    multi_modal_data: NotRequired["MultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
     """
 
-    multi_modal_placeholders: NotRequired[
-        Optional["MultiModalPlaceholderDict"]]
+    multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"]
     """
     Placeholder ranges for the multi-modal data.
     """
 
-    mm_processor_kwargs: NotRequired[Optional[Dict[str, Any]]]
+    mm_processor_kwargs: NotRequired[Dict[str, Any]]
     """
     Optional multi-modal processor kwargs to be forwarded to the
     multimodal input mapper & processor. Note that if multiple modalities
@@ -159,7 +162,7 @@ def token_inputs(
     mm_processor_kwargs: Optional[Dict[str, Any]] = None,
 ) -> TokenInputs:
     """Construct :class:`TokenInputs` from optional values."""
-    inputs = TokenInputs(prompt_token_ids=prompt_token_ids)
+    inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
 
     if prompt is not None:
         inputs["prompt"] = prompt
@@ -173,12 +176,6 @@ def token_inputs(
     return inputs
 
 
-SingletonInputs = TokenInputs
-"""
-A processed :class:`SingletonPrompt` which can be passed to
-:class:`vllm.sequence.Sequence`.
-"""
-
 DecoderOnlyInputs = TokenInputs
 """
 The inputs in :class:`~vllm.LLMEngine` before they are
@@ -187,28 +184,30 @@ def token_inputs(
 """
 
 
-class EncoderDecoderInputs(TokenInputs):
+class EncoderDecoderInputs(TypedDict):
     """
     The inputs in :class:`~vllm.LLMEngine` before they are
     passed to the model executor.
 
     This specifies the required data for encoder-decoder models.
     """
-    encoder_prompt_token_ids: List[int]
-    """The token IDs of the encoder prompt."""
+    encoder: TokenInputs
+    """The inputs for the encoder portion."""
 
-    encoder_prompt: NotRequired[Optional[str]]
-    """
-    The original encoder prompt text corresponding to the token IDs, if
-    available.
-    """
+    decoder: TokenInputs
+    """The inputs for the decoder portion."""
 
-    encoder_multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
-    """
-    Optional multi-modal data to pass to the encoder model,
-    if the model supports it.
-    """
 
+SingletonInputs = TokenInputs
+"""
+A processed :class:`SingletonPrompt` which can be passed to
+:class:`vllm.sequence.Sequence`.
+"""
+
+ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
+"""
+The inputs to :data:`vllm.inputs.InputProcessor`.
+"""
 
 _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
 _T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt)
diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py
index e79d2c813bb4f..09f1ff2cb42e9 100644
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -4,9 +4,9 @@
 
 from vllm.utils import is_list_of
 
-from .data import (DecoderOnlyInputs, EncoderDecoderInputs,
-                   ExplicitEncoderDecoderPrompt, PromptType, SingletonPrompt,
-                   TextPrompt, TokensPrompt)
+from .data import (EncoderDecoderInputs, ExplicitEncoderDecoderPrompt,
+                   ProcessorInputs, PromptType, SingletonPrompt, TextPrompt,
+                   TokensPrompt)
 
 
 class ParsedText(TypedDict):
@@ -98,12 +98,15 @@ def parse_singleton_prompt(
     raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt")
 
 
+def is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]:
+    return isinstance(prompt, dict) and "prompt_token_ids" in prompt
+
+
 def is_explicit_encoder_decoder_prompt(
         prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]:
     return isinstance(prompt, dict) and "encoder_prompt" in prompt
 
 
 def is_encoder_decoder_inputs(
-    inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs],
-) -> TypeIs[EncoderDecoderInputs]:
-    return "encoder_prompt_token_ids" in inputs
+        inputs: ProcessorInputs) -> TypeIs[EncoderDecoderInputs]:
+    return "encoder" in inputs and "decoder" in inputs
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 82ce7d392b719..a5c787a56b5a9 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -1,5 +1,5 @@
 import asyncio
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import List, Optional
 
 from typing_extensions import assert_never
 
@@ -10,22 +10,12 @@
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.utils import print_warning_once
 
-from .data import (DecoderOnlyInputs, EncoderDecoderInputs, PromptType,
-                   SingletonPrompt)
+from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
+                   PromptType, SingletonInputs, SingletonPrompt, token_inputs)
 from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
 
-if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalDataDict
-
 logger = init_logger(__name__)
 
-PromptComponents = Tuple[Optional[str], List[int],
-                         Optional["MultiModalDataDict"], Optional[Dict[str,
-                                                                       Any]]]
-DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
-                                Optional["MultiModalDataDict"],
-                                Optional[Dict[str, Any]]]
-
 
 class InputPreprocessor:
 
@@ -115,7 +105,7 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
         "default" decoder prompt be <BOS>.
 
         However, it is possible that in the future
-        other models may have different or more 
+        other models may have different or more
         complex logic for the default decoder prompt.
         This motivates having a special helper method
         for default decoder prompts.
@@ -132,7 +122,6 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
     def _prepare_decoder_input_ids_for_generation(
         self,
         decoder_input_ids: Optional[List[int]],
-        force_bos: bool = True,
     ) -> List[int]:
         """
         Prepares `decoder_input_ids` for generation with encoder-decoder models.
@@ -162,8 +151,8 @@ def _prepare_decoder_input_ids_for_generation(
             # use decoder_start_token_id as decoder_input_ids
             decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
 
-        if force_bos and (len(decoder_input_ids) == 0
-                          or decoder_input_ids[0] != decoder_start_token_id):
+        if (len(decoder_input_ids) == 0
+                or decoder_input_ids[0] != decoder_start_token_id):
             decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
 
         return decoder_input_ids
@@ -209,12 +198,12 @@ async def _tokenize_prompt_async(
                                             prompt=prompt,
                                             lora_request=lora_request)
 
-    def _extract_prompt_components(
+    def _prompt_to_llm_inputs(
         self,
         prompt: SingletonPrompt,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
-    ) -> PromptComponents:
+    ) -> SingletonInputs:
         '''
         Extract the components of any single encoder or decoder input prompt.
 
@@ -241,34 +230,52 @@ def _extract_prompt_components(
                 request_id=request_id,
                 lora_request=lora_request,
             )
-            multi_modal_data = None
-            mm_processor_kwargs = None
-        elif parsed["type"] == "tokens":
-            prompt_text = None
-            prompt_token_ids = parsed["content"]["prompt_token_ids"]
-            multi_modal_data = parsed["content"].get("multi_modal_data")
-            mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs")
-        elif parsed["type"] == "text":
-            prompt_text = parsed["content"]["prompt"]
+
+            return token_inputs(
+                prompt=prompt_text,
+                prompt_token_ids=prompt_token_ids,
+            )
+
+        if parsed["type"] == "tokens":
+            tokens_content = parsed["content"]
+
+            prompt_token_ids = tokens_content["prompt_token_ids"]
+            multi_modal_data = tokens_content.get("multi_modal_data")
+            mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")
+
+            return token_inputs(
+                prompt_token_ids=prompt_token_ids,
+                multi_modal_data=multi_modal_data,
+                mm_processor_kwargs=mm_processor_kwargs,
+            )
+
+        if parsed["type"] == "text":
+            text_content = parsed["content"]
+
+            prompt_text = text_content["prompt"]
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
                 request_id=request_id,
                 lora_request=lora_request,
             )
-            multi_modal_data = parsed["content"].get("multi_modal_data")
-            mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs")
-        else:
-            assert_never(parsed)
+            multi_modal_data = text_content.get("multi_modal_data")
+            mm_processor_kwargs = text_content.get("mm_processor_kwargs")
+
+            return token_inputs(
+                prompt=prompt_text,
+                prompt_token_ids=prompt_token_ids,
+                multi_modal_data=multi_modal_data,
+                mm_processor_kwargs=mm_processor_kwargs,
+            )
 
-        return (prompt_text, prompt_token_ids, multi_modal_data,
-                mm_processor_kwargs)
+        assert_never(parsed)
 
-    async def _extract_prompt_components_async(
+    async def _prompt_to_llm_inputs_async(
         self,
         prompt: SingletonPrompt,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
-    ) -> PromptComponents:
+    ) -> SingletonInputs:
         """Async version of :meth:`_extract_prompt_components`."""
         parsed = parse_singleton_prompt(prompt)
 
@@ -279,59 +286,74 @@ async def _extract_prompt_components_async(
                 request_id=request_id,
                 lora_request=lora_request,
             )
-            multi_modal_data = None
-            mm_processor_kwargs = None
-        elif parsed["type"] == "tokens":
-            prompt_text = None
-            prompt_token_ids = parsed["content"]["prompt_token_ids"]
-            multi_modal_data = parsed["content"].get("multi_modal_data")
-            mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs")
-        elif parsed["type"] == "text":
-            prompt_text = parsed["content"]["prompt"]
+
+            return token_inputs(
+                prompt=prompt_text,
+                prompt_token_ids=prompt_token_ids,
+            )
+
+        if parsed["type"] == "tokens":
+            tokens_content = parsed["content"]
+
+            prompt_token_ids = tokens_content["prompt_token_ids"]
+            multi_modal_data = tokens_content.get("multi_modal_data")
+            mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")
+
+            return token_inputs(
+                prompt_token_ids=prompt_token_ids,
+                multi_modal_data=multi_modal_data,
+                mm_processor_kwargs=mm_processor_kwargs,
+            )
+
+        if parsed["type"] == "text":
+            text_content = parsed["content"]
+
+            prompt_text = text_content["prompt"]
             prompt_token_ids = await self._tokenize_prompt_async(
                 prompt_text,
                 request_id=request_id,
                 lora_request=lora_request,
             )
-            multi_modal_data = parsed["content"].get("multi_modal_data")
-            mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs")
-        else:
-            assert_never(parsed)
+            multi_modal_data = text_content.get("multi_modal_data")
+            mm_processor_kwargs = text_content.get("mm_processor_kwargs")
+
+            return token_inputs(
+                prompt=prompt_text,
+                prompt_token_ids=prompt_token_ids,
+                multi_modal_data=multi_modal_data,
+                mm_processor_kwargs=mm_processor_kwargs,
+            )
 
-        return (prompt_text, prompt_token_ids, multi_modal_data,
-                mm_processor_kwargs)
+        assert_never(parsed)
 
     def _build_enc_dec_llm_inputs(
         self,
-        encoder_comps: PromptComponents,
-        decoder_comps: DecoderPromptComponents,
-        mm_processor_kwargs: Dict[str, Any],
+        encoder_inputs: SingletonInputs,
+        decoder_inputs: Optional[SingletonInputs],
     ) -> EncoderDecoderInputs:
-        encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps
-        decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps
-
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
-        # If the feature combo become valid
-        if decoder_mm_data is not None:
-            raise ValueError(
-                "Multi-modality decoder inputs of encoder-decoder models are "
-                "not supported yet")
-
-        # For Multi-Modal models (e.g., mllama), the text input can be
-        # <|image|><|begin_of_text|>hello world. And we should not add
-        # another <|begin_of_text|> to the beginning.
-        decoder_prompt_ids = (self._prepare_decoder_input_ids_for_generation(
-            decoder_prompt_ids,
-            force_bos=(encoder_mm_data is None and decoder_mm_data is None)))
+        if encoder_inputs["type"] == "token":
+            pass
+        else:
+            assert_never(encoder_inputs)
+
+        if decoder_inputs is None:
+            dec_token_ids = self._prepare_decoder_input_ids_for_generation(
+                None)
+            decoder_inputs = token_inputs(dec_token_ids)
+        elif decoder_inputs["type"] == "token":
+            dec_token_ids = self._prepare_decoder_input_ids_for_generation(
+                decoder_inputs["prompt_token_ids"])
+            decoder_inputs["prompt_token_ids"] = dec_token_ids
+
+            if "multi_modal_data" in decoder_inputs:
+                raise ValueError("Multi-modal decoder inputs of encoder-"
+                                 "decoder models are not supported yet")
+        else:
+            assert_never(decoder_inputs)
 
         return EncoderDecoderInputs(
-            prompt_token_ids=decoder_prompt_ids,
-            prompt=decoder_prompt,
-            multi_modal_data=decoder_mm_data,
-            mm_processor_kwargs=mm_processor_kwargs,
-            encoder_prompt_token_ids=encoder_prompt_ids,
-            encoder_prompt=encoder_prompt,
-            encoder_multi_modal_data=encoder_mm_data,
+            encoder=encoder_inputs,
+            decoder=decoder_inputs,
         )
 
     def _process_encoder_decoder_prompt(
@@ -341,8 +363,7 @@ def _process_encoder_decoder_prompt(
     ) -> EncoderDecoderInputs:
         '''
         For encoder/decoder models only:
-        Process an input prompt into an
-        :class:`EncoderDecoderInputs` instance.
+        Process an input prompt into an :class:`EncoderDecoderInputs` instance.
 
         There are two types of input prompts:
         singleton prompts which carry only the
@@ -361,7 +382,7 @@ def _process_encoder_decoder_prompt(
         have any possible singleton type; thus this
         method relies on helper functions to obtain
         token ids for the sub-prompts.
-        
+
         Arguments:
 
         * prompt: an input prompt
@@ -372,40 +393,31 @@ def _process_encoder_decoder_prompt(
         * :class:`EncoderDecoderInputs` instance
         '''
 
-        encoder_comps: PromptComponents
-        decoder_comps: DecoderPromptComponents
+        encoder_inputs: SingletonInputs
+        decoder_inputs: Optional[SingletonInputs]
 
         if is_explicit_encoder_decoder_prompt(prompt):
-            encoder_comps = self._extract_prompt_components(
+            encoder_inputs = self._prompt_to_llm_inputs(
                 prompt["encoder_prompt"],
                 request_id=request_id,
             )
 
             if (decoder_input := prompt["decoder_prompt"]) is None:
-                decoder_comps = None, None, None, None
+                decoder_inputs = None
             else:
-                decoder_comps = self._extract_prompt_components(
+                decoder_inputs = self._prompt_to_llm_inputs(
                     decoder_input,
                     request_id=request_id,
                 )
-            # Handle this carefully in case it was directly initialized by user
-            mm_processor_kwargs = prompt.get("mm_processor_kwargs", {})
         else:
-            encoder_comps = self._extract_prompt_components(
+            encoder_inputs = self._prompt_to_llm_inputs(
                 prompt,
                 request_id=request_id,
             )
-            # If there are no decoder components, we assume the
-            # mm_processor_kwargs are in the encoder prompt
-            mm_processor_kwargs = encoder_comps[-1] if encoder_comps[
-                -1] is not None else {}
-            decoder_comps = None, None, None, None
-
-        return self._build_enc_dec_llm_inputs(
-            encoder_comps,
-            decoder_comps,
-            mm_processor_kwargs,
-        )
+
+            decoder_inputs = None
+
+        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
 
     async def _process_encoder_decoder_prompt_async(
         self,
@@ -413,59 +425,50 @@ async def _process_encoder_decoder_prompt_async(
         request_id: str,
     ) -> EncoderDecoderInputs:
         """Async version of :meth:`_process_encoder_decoder_prompt`."""
-        encoder_comps: PromptComponents
-        decoder_comps: DecoderPromptComponents
+        encoder_inputs: SingletonInputs
+        decoder_inputs: Optional[SingletonInputs]
 
         if is_explicit_encoder_decoder_prompt(prompt):
-            encoder_task = self._extract_prompt_components_async(
+            encoder_task = self._prompt_to_llm_inputs_async(
                 prompt["encoder_prompt"],
                 request_id=request_id,
             )
 
             if (decoder_input := prompt["decoder_prompt"]) is None:
-                encoder_comps = await encoder_task
-                decoder_comps = None, None, None, None
+                encoder_inputs = await encoder_task
+                decoder_inputs = None
             else:
-                decoder_task = self._extract_prompt_components_async(
+                decoder_task = self._prompt_to_llm_inputs_async(
                     decoder_input,
                     request_id=request_id,
                 )
 
-                encoder_comps, decoder_comps = await asyncio.gather(
+                encoder_inputs, decoder_inputs = await asyncio.gather(
                     encoder_task, decoder_task)
-            mm_processor_kwargs = prompt["mm_processor_kwargs"]
         else:
-            encoder_comps = await self._extract_prompt_components_async(
+            encoder_inputs = await self._prompt_to_llm_inputs_async(
                 prompt,
                 request_id=request_id,
             )
-            # If there are no decoder components, we assume the
-            # mm_processor_kwargs are in the encoder prompt
-            mm_processor_kwargs = encoder_comps[-1] if encoder_comps[
-                -1] is not None else {}
-            decoder_comps = None, None, None, None
-
-        return self._build_enc_dec_llm_inputs(
-            encoder_comps,
-            decoder_comps,
-            mm_processor_kwargs,
-        )
+
+            decoder_inputs = None
+
+        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
 
     def _build_decoder_only_llm_inputs(
         self,
-        prompt_comps: PromptComponents,
+        prompt_inputs: DecoderOnlyInputs,
         prompt_adapter_request: Optional[PromptAdapterRequest],
     ) -> DecoderOnlyInputs:
-        (prompt, prompt_token_ids, multi_modal_data,
-         mm_processor_kwargs) = prompt_comps
-
-        prompt_token_ids = self._apply_prompt_adapter(
-            prompt_token_ids, prompt_adapter_request=prompt_adapter_request)
+        if prompt_inputs["type"] == "token":
+            prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter(
+                prompt_inputs["prompt_token_ids"],
+                prompt_adapter_request=prompt_adapter_request,
+            )
+        else:
+            assert_never(prompt_inputs)
 
-        return DecoderOnlyInputs(prompt_token_ids=prompt_token_ids,
-                                 prompt=prompt,
-                                 multi_modal_data=multi_modal_data,
-                                 mm_processor_kwargs=mm_processor_kwargs)
+        return prompt_inputs
 
     def _process_decoder_only_prompt(
         self,
@@ -490,7 +493,7 @@ def _process_decoder_only_prompt(
         * :class:`DecoderOnlyInputs` instance
         '''
 
-        prompt_comps = self._extract_prompt_components(
+        prompt_comps = self._prompt_to_llm_inputs(
             prompt,
             request_id=request_id,
             lora_request=lora_request,
@@ -509,7 +512,7 @@ async def _process_decoder_only_prompt_async(
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> DecoderOnlyInputs:
         """Async version of :meth:`_process_decoder_only_prompt`."""
-        prompt_comps = await self._extract_prompt_components_async(
+        prompt_comps = await self._prompt_to_llm_inputs_async(
             prompt,
             request_id=request_id,
             lora_request=lora_request,
@@ -526,7 +529,7 @@ def preprocess(
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> Union[DecoderOnlyInputs, EncoderDecoderInputs]:
+    ) -> ProcessorInputs:
         """Preprocess the input prompt."""
         if self.is_encoder_decoder_model():
             # Encoder-decoder model requires special mapping of
@@ -554,7 +557,7 @@ async def preprocess_async(
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> Union[DecoderOnlyInputs, EncoderDecoderInputs]:
+    ) -> ProcessorInputs:
         """Async version of :meth:`preprocess`."""
         if self.is_encoder_decoder_model():
             # Encoder-decoder model requires special mapping of
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index fbf912a212568..7d7a797be4f60 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -2,7 +2,7 @@
 from collections import UserDict
 from dataclasses import dataclass
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, NamedTuple,
-                    Optional, Protocol, Type)
+                    Optional, Protocol, Type, cast)
 
 from torch import nn
 from transformers import PretrainedConfig
@@ -12,7 +12,7 @@
 from vllm.utils import (get_allowed_kwarg_only_overrides, print_warning_once,
                         resolve_mm_processor_kwargs)
 
-from .data import DecoderOnlyInputs
+from .data import ProcessorInputs
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -109,7 +109,7 @@ def __getitem__(self, key: str) -> int:
             raise KeyError(msg) from exc
 
 
-InputProcessor = Callable[[InputContext, DecoderOnlyInputs], DecoderOnlyInputs]
+InputProcessor = Callable[[InputContext, ProcessorInputs], ProcessorInputs]
 """Preprocess the inputs to the model."""
 
 
@@ -254,8 +254,8 @@ def dummy_data_for_profiling(
     def _default_input_processor(
         self,
         ctx: InputContext,
-        inputs: DecoderOnlyInputs,
-    ) -> DecoderOnlyInputs:
+        inputs: ProcessorInputs,
+    ) -> ProcessorInputs:
         """The default input processor is a no-op."""
         return inputs
 
@@ -288,7 +288,7 @@ def _get_model_input_processor(self, model_cls: Type[nn.Module]):
             .get(model_cls, self._default_input_processor)
 
     def process_input(self, model_config: "ModelConfig",
-                      inputs: DecoderOnlyInputs) -> DecoderOnlyInputs:
+                      inputs: ProcessorInputs) -> ProcessorInputs:
         """
         Apply an input processor to an instance of model inputs.
 
@@ -308,7 +308,7 @@ def process_input(self, model_config: "ModelConfig",
         # If it's empty, it'll fall back to the default kwarg values
         mm_processor_kwargs = resolve_mm_processor_kwargs(
             model_config.mm_processor_kwargs,
-            inputs.get("mm_processor_kwargs"),
+            cast(Dict[str, Any], inputs.get("mm_processor_kwargs")),
             processor,
         )
 
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index d30b9addd09f1..251bfc079684e 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -36,8 +36,8 @@
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import CacheConfig, MultiModalConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         EncoderDecoderInputs, InputContext)
+from vllm.inputs import (INPUT_REGISTRY, DummyData, EncoderDecoderInputs,
+                         InputContext, TokenInputs, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -52,6 +52,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SequenceData
+from vllm.utils import is_list_of
 
 from .clip import CLIPMLP
 from .interfaces import SupportsMultiModal
@@ -86,41 +87,58 @@ def _get_num_image_in_last_group(prompt_token_ids: List[int]) -> int:
     return num_images
 
 
-def input_processor_for_mllama(ctx: InputContext,
-                               inputs: Union[DecoderOnlyInputs,
-                                             EncoderDecoderInputs]):
-    # move encoder_prompt to prompt
-    if inputs.get("prompt") is None:
-        inputs["prompt"] = inputs["encoder_prompt"]
-        inputs["prompt_token_ids"] = inputs["encoder_prompt_token_ids"]
+def input_processor_for_mllama(
+    ctx: InputContext,
+    inputs: EncoderDecoderInputs,
+) -> EncoderDecoderInputs:
+    # Example input to processor:
+    # {
+    #     'encoder': {
+    #         'type': 'token',
+    #         'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30],  # noqa: E501
+    #         'prompt': '<|image|><|begin_of_text|>What is the content of this image?',  # noqa: E501
+    #         'multi_modal_data': {'image': <PIL.Image.Image image mode=RGB size=1770x1180 at 0x7FDE2C624880>},  # noqa: E501
+    #     },
+    #     'decoder': {
+    #         'type': 'token',
+    #         'prompt_token_ids': [128000],
+    #     },
+    # }
+
+    # move encoder prompt to decoder
+    dec_inputs = TokenInputs(**inputs["encoder"])
+
+    multi_modal_data = dec_inputs.get("multi_modal_data")
+    if multi_modal_data is None or "image" not in multi_modal_data:
+        # text-only
+        return EncoderDecoderInputs(
+            encoder=token_inputs([]),
+            decoder=dec_inputs,
+        )
 
-    # process multi-modal data
-    multi_modal_data = inputs.get("encoder_multi_modal_data")
+    image_data = multi_modal_data["image"]
+    if isinstance(image_data, Image.Image):
+        image_data = [image_data]
 
-    if multi_modal_data is None or "image" not in multi_modal_data \
-        or multi_modal_data["image"] is None:
-        # text-only
-        inputs["encoder_prompt"] = ""
-        inputs["encoder_prompt_token_ids"] = []
-        inputs["encoder_multi_modal_data"] = {}
-        return inputs
+    assert is_list_of(image_data, Image.Image)
 
-    if isinstance(multi_modal_data['image'], Image.Image):
-        multi_modal_data['image'] = [multi_modal_data['image']]
     # Since only the last group of consecutive images
     # are attended by the decoded tokens, we only need to
     # get the number of tiles for those images.
     num_decode_images = _get_num_image_in_last_group(
-        inputs["prompt_token_ids"])
+        dec_inputs["prompt_token_ids"])
+
     hf_config = ctx.model_config.hf_config
+    vision_config = hf_config.vision_config
+
     num_tiles = 0
-    for image in multi_modal_data["image"][::-1]:
+    for image in image_data[::-1]:
         width, height = image.size
-        tile_size = hf_config.vision_config.image_size
+        tile_size = vision_config.image_size
         canvas_height, canvas_width = get_optimal_tiled_canvas(
             image_height=height,
             image_width=width,
-            max_image_tiles=hf_config.vision_config.max_num_tiles,
+            max_image_tiles=vision_config.max_num_tiles,
             tile_size=tile_size,
         )
         num_tiles_height = canvas_height // tile_size
@@ -133,14 +151,34 @@ def input_processor_for_mllama(ctx: InputContext,
     # Set encoder prompt length based on the number of tiles.
     # This tells the block manager to allocate correct number
     # of slots for encoder tokens.
-    assert hf_config.vision_config.image_size % 14 == 0, \
+    assert vision_config.image_size % 14 == 0, \
         "chunk size should be multiple of 14"
-    token_per_chunk = (hf_config.vision_config.image_size // 14)**2 + 1
+    token_per_chunk = (vision_config.image_size // 14)**2 + 1
     num_tokens = num_tiles * token_per_chunk
-    inputs["encoder_prompt"] = MLLAMA_IMAGE_TOKEN * num_tokens
-    inputs["encoder_prompt_token_ids"] = [MLLAMA_IMAGE_TOKEN_ID] * num_tokens
 
-    return inputs
+    # Example output from processor:
+    # {
+    #     'encoder': {
+    #         'type': 'token',
+    #         'prompt_token_ids': [128256, 128256, ..., 128256],
+    #         'prompt': '<|image|><|image|>...<|image|>',
+    #         'multi_modal_data': {'image': <PIL.Image.Image image mode=RGB size=1770x1180 at 0x7FDE2C624880>},  # noqa: E501
+    #     },
+    #     'decoder': {
+    #         'type': 'token',
+    #         'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30],  # noqa: E501
+    #         'prompt': '<|image|><|begin_of_text|>What is the content of this image?',  # noqa: E501
+    #         'multi_modal_data': {'image': <PIL.Image.Image image mode=RGB size=1770x1180 at 0x7FDE2C624880>},  # noqa: E501
+    #     },
+    # }
+    return EncoderDecoderInputs(
+        encoder=token_inputs(
+            prompt_token_ids=[MLLAMA_IMAGE_TOKEN_ID] * num_tokens,
+            prompt=MLLAMA_IMAGE_TOKEN * num_tokens,
+            multi_modal_data=multi_modal_data,
+        ),
+        decoder=dec_inputs,
+    )
 
 
 def get_max_mllama_image_tokens(ctx: InputContext) -> int:
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 3a929f5cb5195..af52fbffba19e 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -343,6 +343,11 @@ def register_model(
     def _raise_for_unsupported(self, architectures: List[str]):
         all_supported_archs = self.get_supported_archs()
 
+        if any(arch in all_supported_archs for arch in architectures):
+            raise ValueError(
+                f"Model architectures {architectures} failed "
+                "to be inspected. Please check the logs for more details.")
+
         raise ValueError(
             f"Model architectures {architectures} are not supported for now. "
             f"Supported architectures: {all_supported_archs}")
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 44a9257c9a4c1..7d7ddc7ec4447 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -9,12 +9,12 @@
 from typing import (TYPE_CHECKING, Any, Callable, DefaultDict, Dict, List,
                     Mapping, Optional)
 from typing import Sequence as GenericSequence
-from typing import Set, Tuple, Union, cast
+from typing import Set, Tuple, Union
 
 import msgspec
 import torch
+from typing_extensions import assert_never
 
-from vllm.inputs.parse import is_encoder_decoder_inputs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
 from vllm.pooling_params import PoolingParams
@@ -379,15 +379,10 @@ def __repr__(self) -> str:
 
 class Sequence:
     """Stores the data, status, and block information of a sequence.
-
-    The sequence is constructed from the :code:`SingletonInputs` instance
-    passed in through the :code:`inputs` constructor argument.
-
-    For encoder/decoder models, SingletonInputs encapsulates both a
-    decoder and encoder prompt, creating an ambiguity about which
-    prompt to construct the sequence from. The `from_decoder_prompt`
-    constructor argument signals whether to construct the Sequence
-    from the SingletonInputs decoder prompt, or encoder prompt.
+
+    The sequence is constructed from the :data:`DecoderOnlyInputs`
+    (for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder)
+    instance passed in through the :code:`inputs` constructor argument.
 
     Args:
         seq_id: The ID of the sequence.
@@ -397,10 +392,6 @@ class Sequence:
         eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM.
         lora_request: LoRA request.
         prompt_adapter_request: Prompt Adapter request.
-        from_decoder_prompt: Construct Sequence from SingletonInputs decoder
-                             prompt (True) or encoder prompt (False.) Must be
-                             True for decoder-only model.
-
     """
 
     def __init__(
@@ -411,7 +402,6 @@ def __init__(
         eos_token_id: Optional[int] = None,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        from_decoder_prompt: bool = True,
     ) -> None:
         self.seq_id = seq_id
         self.inputs = inputs
@@ -419,33 +409,6 @@ def __init__(
         self.eos_token_id = eos_token_id
         self.lora_request = lora_request
         self.prompt_adapter_request = prompt_adapter_request
-        self.from_decoder_prompt = from_decoder_prompt
-
-        # For decoder-only models, a Sequence is constructed
-        # from an DecoderOnlyInputs instance (the `inputs` arg.)
-        #
-        # For encoder/decoder models the same `inputs`
-        # instance could be utilized to construct either an
-        # encoder sequence or a decoder sequence, because
-        # `DecoderOnlyInputs` has both decoder- and encoder-oriented
-        # member variables (i.e. it encapsulates both an encoder
-        # and a decoder prompt.) The decision of which type of sequence
-        # to generate is determined by the `from_decoder_prompt` argument.
-        #
-        # When constructing a encoder sequence
-        # (`from_decoder_prompt` False) it matters that
-        # the `DecoderOnlyInputs` instance stored in `inputs` is valid
-        # in the sense that its encoder-related member variables are
-        # populated; below, an exception is raised if this is
-        # not the case.
-        #
-        # When constructing a decoder sequence (`from_decoder_prompt` True)
-        # it does not matter whether `inputs` has its encoder-related
-        # member variables populated.
-        if not (from_decoder_prompt or is_encoder_decoder_inputs(inputs)):
-            raise ValueError("Cannot extract encoder input prompt from "
-                             f"invalid input {inputs}; did you forget the "
-                             "encoder input prompt fields?")
 
         self.data = SequenceData.from_seqs(self.prompt_token_ids)
         self.output_logprobs: SampleLogprobs = []
@@ -470,45 +433,57 @@ def n_blocks(self) -> int:
 
     @cached_property
     def prompt(self) -> Optional[str]:
-        # Select decoder or encoder input prompt str, as appropriate
-        prompt_key: str = ("prompt"
-                           if self.from_decoder_prompt else "encoder_prompt")
+        inputs = self.inputs
 
-        return cast(Optional[str], self.inputs.get(prompt_key))
+        if inputs["type"] == "token":
+            return inputs.get("prompt")
+
+        assert_never(inputs)
 
     @cached_property
     def prompt_token_ids(self) -> List[int]:
-        # Select decoder or encoder input prompt token ids, as appropriate
-        prompt_token_ids_key: str = ("prompt_token_ids"
-                                     if self.from_decoder_prompt else
-                                     "encoder_prompt_token_ids")
+        inputs = self.inputs
 
-        # Cache computed prompt token ids
-        return cast(List[int], self.inputs.get(prompt_token_ids_key))
+        if inputs["type"] == "token":
+            return inputs.get("prompt_token_ids", [])
 
-    @property
-    def multi_modal_data(self) -> MultiModalDataDict:
+        assert_never(inputs)
+
+    @cached_property
+    def prompt_embeds(self) -> Optional[torch.Tensor]:
         inputs = self.inputs
 
-        if (inputs.get("multi_modal_data")
-                and inputs.get("encoder_multi_modal_data")):
-            raise ValueError(
-                "Multi-modal data in both encoder and decoder is not supported."
-            )
+        if inputs["type"] == "token":
+            return None
 
-        return cast(
-            MultiModalDataDict,
-            (inputs.get("multi_modal_data")
-             or inputs.get("encoder_multi_modal_data") or {}),
-        )
+        assert_never(inputs)
+
+    @cached_property
+    def multi_modal_data(self) -> "MultiModalDataDict":
+        inputs = self.inputs
+
+        if inputs["type"] == "token":
+            return inputs.get("multi_modal_data", {})
+
+        assert_never(inputs)
+
+    @cached_property
+    def mm_processor_kwargs(self) -> Dict[str, Any]:
+        inputs = self.inputs
+
+        if inputs["type"] == "token":
+            return inputs.get("mm_processor_kwargs", {})
+
+        assert_never(inputs)
 
     @property
     def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
-        return self.inputs.get("multi_modal_placeholders") or {}
+        inputs = self.inputs
 
-    @property
-    def mm_processor_kwargs(self) -> Dict[str, Any]:
-        return self.inputs.get("mm_processor_kwargs") or {}
+        if inputs["type"] == "token":
+            return inputs.get("multi_modal_placeholders", {})
+
+        assert_never(inputs)
 
     @property
     def lora_int_id(self) -> int:

From ad23318928d40ef7ac969451afa0dc198428c04b Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Mon, 4 Nov 2024 22:46:38 -0500
Subject: [PATCH 57/85] [Bugfix] Fixup Mamba (#10004)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 vllm/model_executor/models/mamba.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index ec726dc4ff4fa..985ba6f3c60c1 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -39,8 +39,8 @@ def __init__(self,
         super().__init__()
         self.config = config
         self.is_falcon_mamba = config.model_type == "falcon_mamba"
-        mixer_rms_rps = config.mixer_rms_rps if self.is_falcon_mamba else None
-        self.mamba = MambaMixer(hidden_size=config.hidden_size,
+        mixer_rms_eps = config.mixer_rms_eps if self.is_falcon_mamba else None
+        self.mixer = MambaMixer(hidden_size=config.hidden_size,
                                 ssm_state_size=config.state_size,
                                 conv_kernel_size=config.conv_kernel,
                                 intermediate_size=config.intermediate_size,
@@ -48,7 +48,7 @@ def __init__(self,
                                 use_conv_bias=config.use_conv_bias,
                                 use_bias=config.use_bias,
                                 use_rms_norm=self.is_falcon_mamba,
-                                rms_norm_eps=mixer_rms_rps,
+                                rms_norm_eps=mixer_rms_eps,
                                 activation=config.hidden_act)
 
         self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
@@ -99,7 +99,6 @@ def __init__(
         for i in range(config.num_hidden_layers):
             decoder_layers.append(
                 MambaDecoderLayer(config,
-                                  layer_idx=i,
                                   cache_config=cache_config,
                                   quant_config=quant_config))
         self.layers = nn.ModuleList(decoder_layers)

From 7a83b1aec06834e58174694042105e365828507a Mon Sep 17 00:00:00 2001
From: Gene Der Su <e870252314@gmail.com>
Date: Tue, 5 Nov 2024 02:04:10 -0800
Subject: [PATCH 58/85] [BugFix] Lazy import ray (#10021)

---
 vllm/engine/multiprocessing/engine.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 9dd6fa5b14315..e1dcb82829d76 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -5,7 +5,6 @@
 
 import cloudpickle
 import zmq
-from ray.exceptions import RayTaskError
 
 from vllm import AsyncEngineArgs, SamplingParams
 # yapf conflicts with isort for this block
@@ -306,11 +305,17 @@ def _health_check(self):
     def _send_outputs(self, outputs: REQUEST_OUTPUTS_T):
         """Send List of RequestOutput to RPCClient."""
         if outputs:
-            # RayTaskError might not pickelable here. We need to unpack the
-            # underlying exception as the real exception in the output.
-            if (isinstance(outputs, RPCError)
-                    and isinstance(outputs.exception, RayTaskError)):
-                outputs.exception = outputs.exception.cause
+            try:
+                from ray.exceptions import RayTaskError
+
+                # RayTaskError might not be picklable here. We need to unpack
+                # the underlying exception as the real exception in the output.
+                if (isinstance(outputs, RPCError)
+                        and isinstance(outputs.exception, RayTaskError)):
+                    outputs.exception = outputs.exception.cause
+            except ImportError:
+                pass
+
             output_bytes = pickle.dumps(outputs)
             self.output_socket.send_multipart((output_bytes, ), copy=False)
 

From 93dee88f6b0ff28c2e8b79d638b4e56d58128927 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Tue, 5 Nov 2024 18:59:56 +0800
Subject: [PATCH 59/85] [Misc] vllm CLI flags should be ordered for better user
 readability (#10017)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 vllm/utils.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/vllm/utils.py b/vllm/utils.py
index a742ec8d76908..0b75e8761c916 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1148,9 +1148,23 @@ def __call__(self, parser, namespace, values, option_string=None):
                              "Expected 'true' or 'false'.")
 
 
+class SortedHelpFormatter(argparse.HelpFormatter):
+    """SortedHelpFormatter that sorts arguments by their option strings."""
+
+    def add_arguments(self, actions):
+        actions = sorted(actions, key=lambda x: x.option_strings)
+        super(SortedHelpFormatter, self).add_arguments(actions)
+
+
 class FlexibleArgumentParser(argparse.ArgumentParser):
     """ArgumentParser that allows both underscore and dash in names."""
 
+    def __init__(self, *args, **kwargs):
+        # Set the default 'formatter_class' to SortedHelpFormatter
+        if 'formatter_class' not in kwargs:
+            kwargs['formatter_class'] = SortedHelpFormatter
+        super().__init__(*args, **kwargs)
+
     def parse_args(self, args=None, namespace=None):
         if args is None:
             args = sys.argv[1:]

From 5952d811398d3a22f30d72d2d2943787a78f66ea Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Tue, 5 Nov 2024 10:50:57 -0500
Subject: [PATCH 60/85] [Frontend] Fix tcp port reservation for api server
 (#10012)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 vllm/entrypoints/openai/api_server.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index bef36ffdbfcd3..917b347ff1161 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -569,7 +569,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     # This avoids race conditions with ray.
     # see https://github.com/vllm-project/vllm/issues/8204
     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    sock.bind(("", args.port))
+    sock.bind((args.host or "", args.port))
+    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
 
     def signal_handler(*_) -> None:
         # Interrupt server on sigterm while initializing
@@ -593,13 +594,14 @@ def signal_handler(*_) -> None:
             ssl_certfile=args.ssl_certfile,
             ssl_ca_certs=args.ssl_ca_certs,
             ssl_cert_reqs=args.ssl_cert_reqs,
-            fd=sock.fileno(),
             **uvicorn_kwargs,
         )
 
     # NB: Await server shutdown only after the backend context is exited
     await shutdown_task
 
+    sock.close()
+
 
 if __name__ == "__main__":
     # NOTE(simon):

From cd34029e91ad2d38a58d190331a65f9096c0b157 Mon Sep 17 00:00:00 2001
From: Richard Liu <39319471+richardsliu@users.noreply.github.com>
Date: Tue, 5 Nov 2024 08:48:44 -0800
Subject: [PATCH 61/85] Refactor TPU requirements file and pin build
 dependencies (#10010)

Signed-off-by: Richard Liu <ricliu@google.com>
---
 Dockerfile.tpu                                |  7 ---
 .../getting_started/tpu-installation.rst      | 57 ++-----------------
 requirements-tpu.txt                          | 20 ++++++-
 3 files changed, 23 insertions(+), 61 deletions(-)

diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index b43442e4c0af1..0a507b6ecdf60 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -9,12 +9,6 @@ RUN apt-get update && apt-get install -y \
     git \
     ffmpeg libsm6 libxext6 libgl1
 
-# Install the TPU and Pallas dependencies.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-
 # Build vLLM.
 COPY . .
 ARG GIT_REPO_CHECK=0
@@ -25,7 +19,6 @@ ENV VLLM_TARGET_DEVICE="tpu"
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     python3 -m pip install \
-        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
         -r requirements-tpu.txt
 RUN python3 setup.py develop
 
diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst
index f0c812b941c1f..75ab2b6ba02dc 100644
--- a/docs/source/getting_started/tpu-installation.rst
+++ b/docs/source/getting_started/tpu-installation.rst
@@ -119,27 +119,19 @@ Uninstall the existing `torch` and `torch_xla` packages:
 
     pip uninstall torch torch-xla -y
 
-Install `torch` and `torch_xla`
+Install build dependencies:
 
 .. code-block:: bash
 
-    pip install --pre torch==2.6.0.dev20241028+cpu torchvision==0.20.0.dev20241028+cpu --index-url https://download.pytorch.org/whl/nightly/cpu
-    pip install 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' -f https://storage.googleapis.com/libtpu-releases/index.html
+    pip install -r requirements-tpu.txt
+    sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev 
 
-Install JAX and Pallas:
+Run the setup script:
 
 .. code-block:: bash
 
-    pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-    pip install jaxlib==0.4.32.dev20240829 jax==0.4.32.dev20240829 -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-
-Install other build dependencies:
+   VLLM_TARGET_DEVICE="tpu" python setup.py develop
 
-.. code-block:: bash
-
-    pip install -r requirements-tpu.txt
-    VLLM_TARGET_DEVICE="tpu" python setup.py develop
-    sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev 
 
 Provision Cloud TPUs with GKE 
 -----------------------------
@@ -168,45 +160,6 @@ Run the Docker image with the following command:
     $ # Make sure to add `--privileged --net host --shm-size=16G`.
     $ docker run --privileged --net host --shm-size=16G -it vllm-tpu
 
-
-.. _build_from_source_tpu:
-
-Build from source
------------------
-
-You can also build and install the TPU backend from source.
-
-First, install the dependencies:
-
-.. code-block:: console
-
-    $ # (Recommended) Create a new conda environment.
-    $ conda create -n myenv python=3.10 -y
-    $ conda activate myenv
-
-    $ # Clean up the existing torch and torch-xla packages.
-    $ pip uninstall torch torch-xla -y
-
-    $ # Install PyTorch and PyTorch XLA.
-    $ export DATE="20241017"
-    $ export TORCH_VERSION="2.6.0"
-    $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl
-    $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl
-
-    $ # Install JAX and Pallas.
-    $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-    $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-
-    $ # Install other build dependencies.
-    $ pip install -r requirements-tpu.txt
-
-
-Next, build vLLM from source. This will only take a few seconds:
-
-.. code-block:: console
-
-    $ VLLM_TARGET_DEVICE="tpu" python setup.py develop
-
 .. note::
 
     Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape.
diff --git a/requirements-tpu.txt b/requirements-tpu.txt
index 4c606cf0a9105..f9a0770804e55 100644
--- a/requirements-tpu.txt
+++ b/requirements-tpu.txt
@@ -2,6 +2,22 @@
 -r requirements-common.txt
 
 # Dependencies for TPU
-# Currently, the TPU backend uses a nightly version of PyTorch XLA.
-# You can install the dependencies in Dockerfile.tpu.
+cmake>=3.26
+ninja
+packaging
+setuptools-scm>=8
+wheel
+jinja2
 ray[default]
+
+# Install torch_xla
+--pre
+--extra-index-url https://download.pytorch.org/whl/nightly/cpu
+--find-links https://storage.googleapis.com/libtpu-releases/index.html
+--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
+--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+torch==2.6.0.dev20241028+cpu
+torchvision==0.20.0.dev20241028+cpu
+torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241028-cp310-cp310-linux_x86_64.whl
+jaxlib==0.4.32.dev20240829
+jax==0.4.32.dev20240829

From 09d3550372db10f8c75fddd437325a863265fd82 Mon Sep 17 00:00:00 2001
From: "Chenghao (Alan) Yang" <chenghao@uchicago.edu>
Date: Tue, 5 Nov 2024 11:50:50 -0600
Subject: [PATCH 62/85] [Misc] Add logging for CUDA memory (#10027)

Signed-off-by: Chenghao Yang <yangalan1996@gmail.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Chenghao Yang <yangalan1996@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
---
 vllm/worker/model_runner.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 328dab598f8ef..2447eecf7957d 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -48,9 +48,10 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.transformers_utils.config import uses_mrope
-from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d,
-                        flatten_2d_lists, is_pin_memory_available,
-                        supports_dynamo, weak_ref_tensor)
+from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache,
+                        async_tensor_h2d, flatten_2d_lists,
+                        is_pin_memory_available, supports_dynamo,
+                        weak_ref_tensor)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
     _add_attn_metadata_broadcastable_dict,
@@ -1383,16 +1384,16 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
         per sequence in the batch.
         """
         assert not self.model_config.enforce_eager
-        logger.info("Capturing the model for CUDA graphs. This may lead to "
+        logger.info("Capturing cudagraphs for decoding. This may lead to "
                     "unexpected consequences if the model is not static. To "
                     "run the model in eager mode, set 'enforce_eager=True' or "
                     "use '--enforce-eager' in the CLI.")
-        logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
-                    "If you are running out of memory, consider decreasing "
-                    "`gpu_memory_utilization` or enforcing eager mode. "
-                    "You can also reduce the `max_num_seqs` as needed "
-                    "to decrease memory usage.")
+        logger.info("If out-of-memory error occurs during cudagraph capture,"
+                    " consider decreasing `gpu_memory_utilization` or "
+                    "switching to eager mode. You can also reduce the "
+                    "`max_num_seqs` as needed to decrease memory usage.")
         start_time = time.perf_counter()
+        start_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
         # Prepare dummy inputs. These will be reused for all batch sizes.
         max_batch_size = self.max_batchsize_to_capture
@@ -1497,9 +1498,12 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                         graph_runner)
 
         end_time = time.perf_counter()
+        end_free_gpu_memory = torch.cuda.mem_get_info()[0]
         elapsed_time = end_time - start_time
+        cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
         # This usually takes < 10 seconds.
-        logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
+        logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
+                    elapsed_time, cuda_graph_size / GiB_bytes)
 
     def _update_inputs_to_capture_for_enc_dec_model(self,
                                                     capture_inputs: Dict[str,

From 731aec5be713a89dccf1d7106290da17621af816 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Tue, 5 Nov 2024 13:30:42 -0500
Subject: [PATCH 63/85] [CI/Build] Limit github CI jobs based on files changed
 (#9928)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 .github/workflows/actionlint.yml   |  2 ++
 .github/workflows/clang-format.yml | 12 ++++++++++++
 .github/workflows/mypy.yaml        | 10 ++++++++++
 .github/workflows/ruff.yml         | 17 +++++++++++++----
 .github/workflows/yapf.yml         |  9 ++++++++-
 5 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index b80749aaa8fec..5eddf6b7c649b 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -6,12 +6,14 @@ on:
     paths:
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*'
+      - '.github/workflows/matchers/actionlint.json'
   pull_request:
     branches:
       - "main"
     paths:
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*'
+      - '.github/workflows/matchers/actionlint.json'
 
 env:
   LC_ALL: en_US.UTF-8
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
index 68d60d7365ed1..167c115d8956f 100644
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@@ -6,9 +6,21 @@ on:
   push:
     branches:
       - main
+    paths:
+      - '**/*.h'
+      - '**/*.cpp'
+      - '**/*.cu'
+      - '**/*.cuh'
+      - '.github/workflows/clang-format.yml'
   pull_request:
     branches:
       - main
+    paths:
+      - '**/*.h'
+      - '**/*.cpp'
+      - '**/*.cu'
+      - '**/*.cuh'
+      - '.github/workflows/clang-format.yml'
 
 jobs:
   clang-format:
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 5f1e5f8eeaf7d..18b354948f0cc 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -6,9 +6,19 @@ on:
   push:
     branches:
       - main
+    paths:
+      - '**/*.py'
+      - '.github/workflows/mypy.yaml'
+      - 'tools/mypy.sh'
+      - 'pyproject.toml'
   pull_request:
     branches:
       - main
+    paths:
+      - '**/*.py'
+      - '.github/workflows/mypy.yaml'
+      - 'tools/mypy.sh'
+      - 'pyproject.toml'
 
 jobs:
   mypy:
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 9cc8a9e914474..197f918765e7d 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -6,16 +6,28 @@ on:
   push:
     branches:
       - main
+    paths:
+      - "**/*.py"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/matchers/ruff.json
+      - .github/workflows/ruff.yml
   pull_request:
     branches:
       - main
+    paths:
+      - "**/*.py"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/matchers/ruff.json
+      - .github/workflows/ruff.yml
 
 jobs:
   ruff:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.12"]
     steps:
     - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
     - name: Set up Python ${{ matrix.python-version }}
@@ -30,9 +42,6 @@ jobs:
       run: |
         echo "::add-matcher::.github/workflows/matchers/ruff.json"
         ruff check --output-format github .
-    - name: Spelling check with codespell
-      run: |
-        codespell --toml pyproject.toml
     - name: Run isort
       run: |
         isort . --check-only
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
index 9f06b35c19e32..35579302c5c14 100644
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -6,15 +6,22 @@ on:
   push:
     branches:
       - main
+    paths:
+      - "**/*.py"
+      - .github/workflows/yapf.yml
   pull_request:
     branches:
       - main
+    paths:
+      - "**/*.py"
+      - .github/workflows/yapf.yml
+
 jobs:
   yapf:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.12"]
     steps:
     - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
     - name: Set up Python ${{ matrix.python-version }}

From a53046b16fd11436eb2b15421079b7c5b353f955 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Tue, 5 Nov 2024 13:42:20 -0500
Subject: [PATCH 64/85] [Model] Support quantization of PixtralHFTransformer
 for PixtralHF (#9921)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 vllm/model_executor/layers/activation.py |  30 +++++++
 vllm/model_executor/models/pixtral.py    | 100 ++++++++++++++---------
 2 files changed, 90 insertions(+), 40 deletions(-)

diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 658a3700f33d6..e347ca80ff765 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -299,3 +299,33 @@ def get_act_fn(
         return ScaledActivation(act_fn, intermediate_size, input_is_parallel,
                                 params_dtype)
     return act_fn
+
+
+_ACTIVATION_AND_MUL_REGISTRY = LazyDict({
+    "gelu": lambda: GeluAndMul(),
+    "silu": lambda: SiluAndMul(),
+})
+
+
+def get_act_and_mul_fn(
+    act_fn_name: str,
+    quant_config: Optional[QuantizationConfig] = None,
+    intermediate_size: Optional[int] = None,
+    input_is_parallel: bool = True,
+    params_dtype: Optional[torch.dtype] = None,
+) -> nn.Module:
+    """Get an activation-and-mul (e.g. SiluAndMul) function by name."""
+    act_fn_name = act_fn_name.lower()
+    if act_fn_name not in _ACTIVATION_AND_MUL_REGISTRY:
+        raise ValueError(
+            f"Activation function {act_fn_name!r} is not supported.")
+
+    act_fn = _ACTIVATION_AND_MUL_REGISTRY[act_fn_name]
+    if (quant_config is not None
+            and act_fn_name in quant_config.get_scaled_act_names()):
+        if intermediate_size is None:
+            raise ValueError("intermediate_size must be specified for scaled "
+                             "activation functions.")
+        return ScaledActivation(act_fn, intermediate_size, input_is_parallel,
+                                params_dtype)
+    return act_fn
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 051454c49bff8..ee9f150b17cfc 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -19,8 +19,11 @@
 from vllm.config import CacheConfig, ModelConfig, MultiModalConfig
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
                          InputContext, token_inputs)
-from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -798,20 +801,24 @@ def __init__(
         super().__init__()
 
         assert config.intermediate_size is not None
-        # TODO: Use quant_config and prefix after optimizing this
-        self.gate_proj = nn.Linear(config.hidden_size,
-                                   config.intermediate_size,
-                                   bias=False)
-        self.up_proj = nn.Linear(config.hidden_size,
-                                 config.intermediate_size,
-                                 bias=False)
-        self.down_proj = nn.Linear(config.intermediate_size,
-                                   config.hidden_size,
-                                   bias=False)
-        self.act = get_act_fn(config.hidden_act)
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=config.hidden_size,
+            output_sizes=[config.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
+        self.down_proj = RowParallelLinear(input_size=config.intermediate_size,
+                                           output_size=config.hidden_size,
+                                           bias=False,
+                                           quant_config=quant_config,
+                                           prefix=f"{prefix}.down_proj")
+        self.act_and_mul = get_act_and_mul_fn(config.hidden_act)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_and_mul(gate_up)
+        x, _ = self.down_proj(x)
+        return x
 
 
 class PixtralHFAttention(nn.Module):
@@ -830,21 +837,21 @@ def __init__(
         self.n_heads = config.num_attention_heads
         self.head_dim = config.hidden_size // config.num_attention_heads
 
-        self.scale = self.head_dim**-0.5
-
-        # TODO: Use quant_config and prefix after optimizing this
-        self.q_proj = nn.Linear(config.hidden_size,
-                                config.hidden_size,
-                                bias=False)
-        self.k_proj = nn.Linear(config.hidden_size,
-                                config.hidden_size,
-                                bias=False)
-        self.v_proj = nn.Linear(config.hidden_size,
-                                config.hidden_size,
-                                bias=False)
-        self.o_proj = nn.Linear(config.hidden_size,
-                                config.hidden_size,
-                                bias=False)
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=config.hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.n_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            input_size=config.hidden_size,
+            output_size=config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
 
     def forward(
         self,
@@ -854,13 +861,13 @@ def forward(
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         batch, patches, _ = hidden_states.size()
 
-        q = self.q_proj(hidden_states)
-        k = self.k_proj(hidden_states)
-        v = self.v_proj(hidden_states)
+        qkv_states, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv_states.chunk(3, dim=-1)
 
         # Transpose q and k to apply HF's Rotary Position Embedding
         q = q.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2)
         k = k.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2)
+        v = v.view(batch, patches, self.n_heads, self.head_dim)
         cos, sin = position_embeddings
         q, k = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=0)
 
@@ -868,22 +875,21 @@ def forward(
             # Transpose q and k back for attention
             q = q.transpose(1, 2).contiguous()
             k = k.transpose(1, 2).contiguous()
-            v = v.reshape(batch, patches, self.n_heads, self.head_dim)
 
             out = xops.memory_efficient_attention(q,
                                                   k,
                                                   v,
                                                   attn_bias=attention_mask)
         else:
-            v = v.reshape(batch, patches, self.n_heads,
-                          self.head_dim).transpose(1, 2)
+            v = v.transpose(1, 2)
             out = nn.functional.scaled_dot_product_attention(
                 q, k, v, attn_mask=attention_mask)
             out = out.transpose(1, 2)
 
-        out = out.reshape(batch, patches, self.n_heads * self.head_dim)
+        out = out.view(batch, patches, self.n_heads * self.head_dim)
+        attn_output, _ = self.o_proj(out)
 
-        return self.o_proj(out)
+        return attn_output, None
 
 
 class PixtralHFTransformerBlock(nn.Module):
@@ -912,9 +918,9 @@ def forward(
         attention_mask: torch.Tensor,
         position_embeddings: torch.Tensor,
     ) -> torch.Tensor:
-        r = self.attention.forward(self.attention_norm(hidden_states),
-                                   attention_mask=attention_mask,
-                                   position_embeddings=position_embeddings)
+        r, _ = self.attention.forward(self.attention_norm(hidden_states),
+                                      attention_mask=attention_mask,
+                                      position_embeddings=position_embeddings)
         h = hidden_states + r
         r = self.feed_forward.forward(self.ffn_norm(h))
         out = h + r
@@ -1053,10 +1059,24 @@ def forward(
     # (TODO) Add prefix argument for filtering out weights to be loaded
     #        ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        stacked_params_mapping = []
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
         params_dict = dict(self.named_parameters())
+        layer_count = len(self.transformer.layers)
 
         for name, loaded_weight in weights:
+            # omit layers when num_hidden_layers_override is set
+            if name.startswith("transformer.layers"):
+                layer_idx = int(name.split(".")[2])
+                if layer_idx >= layer_count:
+                    continue
+
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue

From d2e80332a7cedcfd23ec705b109c5fa3ad94fcc0 Mon Sep 17 00:00:00 2001
From: lkchen <github@lkchen.net>
Date: Tue, 5 Nov 2024 11:30:02 -0800
Subject: [PATCH 65/85] [Feature] Update benchmark_throughput.py to support
 image input (#9851)

Signed-off-by: Linkun Chen <github+anyscale@lkchen.net>
Co-authored-by: Linkun Chen <github+anyscale@lkchen.net>
---
 benchmarks/README.md               | 11 ++++
 benchmarks/benchmark_throughput.py | 82 +++++++++++++++++++++++-------
 2 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 192d6c4022c83..2aa4a285021f1 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -6,3 +6,14 @@ You can download the dataset by running:
 ```bash
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 ```
+
+## Downloading the ShareGPT4V dataset
+
+The JSON file refers to several image datasets (COCO, LLaVA, etc.). The benchmark scripts
+will skip a datapoint if the referenced image is missing.
+```bash
+wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
+mkdir coco -p
+wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
+unzip coco/train2017.zip -d coco/
+```
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 262b8652e49ff..159cf055737ce 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -8,6 +8,7 @@
 
 import torch
 import uvloop
+from PIL import Image
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
@@ -38,12 +39,33 @@ class SampleRequest:
     multi_modal_data: Optional[MultiModalDataDict] = None
 
 
-def sample_requests(
-    dataset_path: str,
-    num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
-) -> List[SampleRequest]:
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+    """Prepend and append special tokens around the question to form a prompt.
+
+    Args:
+        question: The input question text to wrap with special tokens
+        model: The name of the model being used, to determine which special
+            tokens to add
+
+    Returns:
+        The formatted prompt string with appropriate special tokens for the
+            model
+
+    Raises:
+        ValueError: If an unsupported model name is provided
+    """
+    model = model.lower()
+    if "pixtral" in model:
+        return f"<s>[INST]{question}\n[IMG][/INST]"
+    raise ValueError(f"Unsupported model {model}")
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    dataset_path: str = args.dataset
+    num_requests: int = args.num_prompts
+    fixed_output_len: Optional[int] = args.output_len
+    model: str = args.model
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
 
@@ -52,23 +74,36 @@ def sample_requests(
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
-                data["conversations"][1]["value"]) for data in dataset]
-
     # Shuffle the dataset.
     random.shuffle(dataset)
 
     # Filter out sequences that are too long or too short
     filtered_dataset: List[SampleRequest] = []
-    for i in range(len(dataset)):
+    for data in dataset:
         if len(filtered_dataset) == num_requests:
             break
 
+        # Only keep the first two turns of each conversation.
+        prompt = data["conversations"][0]["value"]
+        completion = data["conversations"][1]["value"]
+
+        multi_modal_data: Optional[MultiModalDataDict] = None
+        if "image" in data:
+            multi_modal_data = multi_modal_data or {}
+            image_path = data["image"]
+            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+            assert isinstance(image_path,
+                              str), "Only support single image input"
+            try:
+                multi_modal_data["image"] = Image.open(image_path).convert(
+                    "RGB")
+            except FileNotFoundError:
+                # Ignore datapoint where asset is missing
+                continue
+            prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
         # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
         prompt_token_ids = tokenizer(prompt).input_ids
-        completion = dataset[i][1]
         completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
@@ -82,7 +117,8 @@ def sample_requests(
         filtered_dataset.append(
             SampleRequest(prompt=prompt,
                           prompt_len=prompt_len,
-                          expected_output_len=output_len))
+                          expected_output_len=output_len,
+                          multi_modal_data=multi_modal_data))
 
     return filtered_dataset
 
@@ -99,7 +135,9 @@ def run_vllm(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
-        prompts.append(TextPrompt(prompt=request.prompt))
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
         sampling_params.append(
             SamplingParams(
                 n=n,
@@ -148,7 +186,9 @@ async def run_vllm_async(
         prompts: List[TextPrompt] = []
         sampling_params: List[SamplingParams] = []
         for request in requests:
-            prompts.append(TextPrompt(prompt=request.prompt))
+            prompts.append(
+                TextPrompt(prompt=request.prompt,
+                           multi_modal_data=request.multi_modal_data))
             sampling_params.append(
                 SamplingParams(
                     n=n,
@@ -272,9 +312,10 @@ def main(args: argparse.Namespace):
             for _ in range(args.num_prompts)
         ]
     else:
-        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
-                                   args.output_len)
+        requests = sample_requests(tokenizer, args)
 
+    is_multi_modal = any(request.multi_modal_data is not None
+                         for request in requests)
     if args.backend == "vllm":
         if args.async_engine:
             elapsed_time = uvloop.run(
@@ -300,6 +341,11 @@ def main(args: argparse.Namespace):
                            for request in requests)
     total_output_tokens = sum(request.expected_output_len
                               for request in requests)
+    if is_multi_modal:
+        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+              "following metrics are not accurate because image tokens are not"
+              " counted. See vllm-project/vllm/issues/9778 for details.")
+        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
           f"{total_output_tokens / elapsed_time:.2f} output tokens/s")

From b9c64c0ca79ccdea608f337fbb7e4b0c75fe3aac Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Wed, 6 Nov 2024 03:40:08 +0800
Subject: [PATCH 66/85] [Misc] Modify BNB parameter name (#9997)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 .../layers/quantization/bitsandbytes.py            |  9 +++++----
 vllm/model_executor/layers/resampler.py            |  2 +-
 vllm/model_executor/model_loader/loader.py         | 14 +++++---------
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 718967a065192..78965d7b9495c 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -203,8 +203,9 @@ def create_qweight_for_4bit():
             qweight = create_qweight_for_8bit()
         else:
             qweight = create_qweight_for_4bit()
-
-        layer.register_parameter("qweight", qweight)
+        # Enable parameters to have the same name as in the BNB
+        # checkpoint format.
+        layer.register_parameter("weight", qweight)
         set_weight_attrs(qweight, extra_weight_attrs)
 
     def apply(self,
@@ -234,7 +235,7 @@ def _apply_8bit_weight(
             reshape_after_matmul = True
         bf_x = x.to(torch.bfloat16)
 
-        qweight = layer.qweight
+        qweight = layer.weight
         offsets = qweight.bnb_shard_offsets
         quant_states = qweight.bnb_quant_state
         matmul_states = qweight.matmul_state
@@ -313,7 +314,7 @@ def _apply_4bit_weight(
             reshape_after_matmul = True
         bf_x = x.to(torch.bfloat16)
 
-        qweight = layer.qweight
+        qweight = layer.weight
         quant_states = qweight.bnb_quant_state
         offsets = qweight.bnb_shard_offsets
 
diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py
index bce91f1d7fd5e..bca44d2bf2e28 100644
--- a/vllm/model_executor/layers/resampler.py
+++ b/vllm/model_executor/layers/resampler.py
@@ -177,7 +177,7 @@ def __init__(self,
                                             embed_dim,
                                             bias=False,
                                             quant_config=quant_config,
-                                            prefix=prefix)
+                                            prefix=f"{prefix}.kv_proj")
         else:
             # Maintain the same return value with ReplicatedLinear.forward
             self.kv_proj = lambda *args, **kwargs: (  # type: ignore # noqa 
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index c3e0290f270ae..1f8d531198324 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -892,7 +892,7 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors,
             if not weight_name.lower().endswith(".scb"):
                 continue
 
-            weight_key = weight_name.lower().replace(".scb", ".qweight")
+            weight_key = weight_name.lower().replace(".scb", ".weight")
             quant_state_dict[weight_key] = weight_tensor
 
         for weight_name, weight_tensor in self._hf_weight_iter(
@@ -901,11 +901,9 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors,
             if self._is_8bit_weight_name(weight_name):
                 continue
 
-            qweight_name = weight_name.replace(".weight", ".qweight")
-
-            if qweight_name in quant_state_dict:
+            if weight_name in quant_state_dict:
                 set_weight_attrs(weight_tensor, {"load_in_8bit": True})
-                yield qweight_name, weight_tensor
+                yield weight_name, weight_tensor
             else:
                 yield weight_name, weight_tensor
 
@@ -950,9 +948,8 @@ def _parse_quant_state(param_name: str,
             (f"{weight_name}.quant_state.bitsandbytes__fp4" \
                     in temp_state_dict):
                 quant_state = _parse_quant_state(weight_name, temp_state_dict)
-                weight_name = weight_name.replace(".weight", ".qweight")
                 quant_state_dict[weight_name] = quant_state
-                yield weight_name.replace(".weight", ".qweight"), weight_tensor
+                yield weight_name, weight_tensor
             else:
                 yield weight_name, weight_tensor
 
@@ -967,7 +964,6 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors,
 
             if any(target_module in weight_name for target_module in
                    self.target_modules) and weight_name.endswith(".weight"):
-                weight_name = weight_name.replace(".weight", ".qweight")
                 # Without sharding
                 if any(
                         weight_name.startswith(module)
@@ -1093,7 +1089,7 @@ def _load_weights(self, model_config: ModelConfig,
                 # Some models, such as MiniCPM V2.5/2.6, contain both
                 # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj'
                 # from being incorrectly identified as being present in
-                # 'vpm.encoder.layers.0.self_attn.qkv_proj.qweight
+                # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight'.
                 if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".":
                     shard_index = index
                     quant_param_name = quant_param_name.replace(

From 02462465ea1c45163fde632fb94e0e4939ee8a59 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Tue, 5 Nov 2024 16:02:23 -0500
Subject: [PATCH 67/85] [CI] Prune tests/models/decoder_only/language/* tests
 (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 .buildkite/test-pipeline.yaml                 |  3 +-
 .../decoder_only/language/test_big_models.py  | 93 -------------------
 .../models/decoder_only/language/test_fp8.py  | 10 +-
 .../decoder_only/language/test_gptq_marlin.py | 13 ---
 .../language/test_gptq_marlin_24.py           | 12 +--
 .../decoder_only/language/test_marlin.py      | 69 --------------
 .../decoder_only/language/test_mistral.py     | 37 ++++----
 .../decoder_only/language/test_models.py      | 69 +++++++-------
 .../models/decoder_only/language/test_qwen.py | 34 -------
 9 files changed, 70 insertions(+), 270 deletions(-)
 delete mode 100644 tests/models/decoder_only/language/test_big_models.py
 delete mode 100644 tests/models/decoder_only/language/test_marlin.py
 delete mode 100644 tests/models/decoder_only/language/test_qwen.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 9444dc43ea97e..1eb749f64d36b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -321,7 +321,6 @@ steps:
   - tests/models/decoder_only/language
   commands:
     - pytest -v -s models/decoder_only/language/test_models.py
-    - pytest -v -s models/decoder_only/language/test_big_models.py
 
 - label: Decoder-only Language Models Test (Extended) # 1h20min
   nightly: true
@@ -329,7 +328,7 @@ steps:
   - vllm/
   - tests/models/decoder_only/language
   commands:
-    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
+    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
 
 - label: Decoder-only Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py
deleted file mode 100644
index fcfc159e4f5a0..0000000000000
--- a/tests/models/decoder_only/language/test_big_models.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This tests bigger models and use half precision.
-
-Run `pytest tests/models/test_big_models.py`.
-"""
-import pytest
-
-from vllm.platforms import current_platform
-
-from ...utils import check_logprobs_close, check_outputs_equal
-
-MODELS = [
-    "meta-llama/Llama-2-7b-hf",
-    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
-    # "Deci/DeciLM-7b",  # Broken
-    # "tiiuae/falcon-7b",  # Broken
-    "EleutherAI/gpt-j-6b",
-    # "mosaicml/mpt-7b",  # Broken
-    # "Qwen/Qwen1.5-0.5B"  # Broken,
-]
-
-if not current_platform.is_cpu():
-    MODELS += [
-        # fused_moe which not supported on CPU
-        "openbmb/MiniCPM3-4B",
-        # Head size isn't supported on CPU
-        "h2oai/h2o-danube3-4b-base",
-    ]
-
-# TODO: remove this after CPU float16 support ready
-target_dtype = "float" if current_platform.is_cpu() else "half"
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [32])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-
-    if model == "openbmb/MiniCPM3-4B":
-        # the output becomes slightly different when upgrading to
-        # pytorch 2.5 . Changing to logprobs checks instead of exact
-        # output checks.
-        NUM_LOG_PROBS = 8
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py
index 5a947ce62c785..f874bf6c73142 100644
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -21,11 +21,11 @@
     "kv_cache_dtype,base_model,test_model,scale_path",
     [
         # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
         # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct", None),
         # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
         ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
          "meta-llama/Llama-2-7b-chat-hf",
@@ -33,7 +33,7 @@
     ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [True])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
diff --git a/tests/models/decoder_only/language/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py
index 2155e83dbe915..a896f145c11f1 100644
--- a/tests/models/decoder_only/language/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
@@ -22,24 +22,11 @@
 MAX_MODEL_LEN = 1024
 
 MODELS = [
-    # act_order==False, group_size=channelwise
-    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
-    # act_order==False, group_size=128
-    ("TheBloke/Llama-2-7B-GPTQ", "main"),
-
     # act_order==True, group_size=128
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
-    # act_order==True, group_size=64
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
-    # act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
 
     # 8-bit, act_order==True, group_size=channelwise
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
-    # 8-bit, act_order==True, group_size=128
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
-    # 8-bit, act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
 
     # 4-bit, act_order==True, group_size=128
     ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
diff --git a/tests/models/decoder_only/language/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py
index d65be05f141b4..aa63f9f36a3a8 100644
--- a/tests/models/decoder_only/language/test_gptq_marlin_24.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py
@@ -25,16 +25,16 @@ class ModelPair:
     # 4-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
-    # 4-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+    # # 4-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
 
     # 8-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
-    # 8-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+    # # 8-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
 ]
 
 
diff --git a/tests/models/decoder_only/language/test_marlin.py b/tests/models/decoder_only/language/test_marlin.py
deleted file mode 100644
index c802346dee8af..0000000000000
--- a/tests/models/decoder_only/language/test_marlin.py
+++ /dev/null
@@ -1,69 +0,0 @@
-"""Compare the outputs of a GPTQ model to a Marlin model.
-
-Note: GPTQ and Marlin do not have bitwise correctness.
-As a result, in this test, we just confirm that the top selected tokens of the
-Marlin/GPTQ models are in the top 3 selections of each other.
-
-Note: Marlin internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for Marlin. As a result, we re-run the test
-up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_marlin.py`.
-"""
-from dataclasses import dataclass
-
-import pytest
-
-from tests.quantization.utils import is_quant_method_supported
-
-from ...utils import check_logprobs_close
-
-
-@dataclass
-class ModelPair:
-    model_marlin: str
-    model_gptq: str
-
-
-model_pairs = [
-    ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128",
-              model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"),
-    ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin",
-              model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"),
-    ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
-              model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq")
-]
-
-
-@pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(not is_quant_method_supported("marlin"),
-                    reason="Marlin is not supported on this GPU type.")
-@pytest.mark.parametrize("model_pair", model_pairs)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(
-    vllm_runner,
-    example_prompts,
-    model_pair: ModelPair,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    with vllm_runner(model_pair.model_marlin,
-                     dtype=dtype,
-                     quantization="marlin") as marlin_model:
-        marlin_outputs = marlin_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
-        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    check_logprobs_close(
-        outputs_0_lst=gptq_outputs,
-        outputs_1_lst=marlin_outputs,
-        name_0="gptq",
-        name_1="marlin",
-    )
diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py
index 174b905d9cbb9..5be44c54a717c 100644
--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -4,7 +4,7 @@
 """
 import pytest
 
-from vllm import LLM, SamplingParams
+from vllm import SamplingParams
 
 from ...utils import check_logprobs_close
 
@@ -15,6 +15,10 @@
     # "mistralai/Mistral-Nemo-Instruct-2407"
 ]
 
+MISTRAL_FORMAT_MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.3",
+]
+
 SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 SYMBOLIC_LANG_PROMPTS = [
     "勇敢な船乗りについての詩を書く",  # japanese
@@ -95,7 +99,7 @@ def test_models(
     )
 
 
-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -135,28 +139,29 @@ def test_mistral_format(
     )
 
 
-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS)
 def test_mistral_symbolic_languages(
+    vllm_runner,
     model: str,
     dtype: str,
-    prompt: str,
 ) -> None:
-    prompt = "hi"
-    msg = {"role": "user", "content": prompt}
-    llm = LLM(model=model,
-              dtype=dtype,
-              max_model_len=8192,
-              tokenizer_mode="mistral",
-              config_format="mistral",
-              load_format="mistral")
-    outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)
-    assert "�" not in outputs[0].outputs[0].text.strip()
+    with vllm_runner(model,
+                     dtype=dtype,
+                     max_model_len=8192,
+                     tokenizer_mode="mistral",
+                     config_format="mistral",
+                     load_format="mistral") as vllm_model:
+        for prompt in SYMBOLIC_LANG_PROMPTS:
+            msg = {"role": "user", "content": prompt}
+            outputs = vllm_model.model.chat([msg],
+                                            sampling_params=SAMPLING_PARAMS)
+            assert "�" not in outputs[0].outputs[0].text.strip()
 
 
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("model", MODELS[1:])  # v1 can't do func calling
+@pytest.mark.parametrize("model",
+                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
 def test_mistral_function_calling(
     vllm_runner,
     model: str,
diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py
index 68055cbe29095..05117666f8c3f 100644
--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -7,25 +7,39 @@
 """
 import pytest
 
-from ...utils import check_outputs_equal
+from vllm.platforms import current_platform
+
+from ...utils import check_logprobs_close
 
 MODELS = [
-    "facebook/opt-125m",
-    "gpt2",
-    "bigcode/tiny_starcoder_py",
-    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",  # Testing alibi slopes.
-    "microsoft/phi-2",
-    "stabilityai/stablelm-3b-4e1t",
-    # "allenai/OLMo-1B",  # Broken
-    "bigcode/starcoder2-3b",
-    "google/gemma-1.1-2b-it",
+    "facebook/opt-125m",  # opt
+    "openai-community/gpt2",  # gpt2
+    # "Milos/slovak-gpt-j-405M",  # gptj
+    # "bigcode/tiny_starcoder_py",  # gpt_bigcode
+    # "EleutherAI/pythia-70m",  # gpt_neox
+    "bigscience/bloom-560m",  # bloom - testing alibi slopes
+    "microsoft/phi-2",  # phi
+    # "stabilityai/stablelm-3b-4e1t",  # stablelm
+    # "bigcode/starcoder2-3b",  # starcoder2
+    "google/gemma-1.1-2b-it",  # gemma
+    "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
+    "meta-llama/Llama-3.2-1B-Instruct",  # llama
 ]
 
+if not current_platform.is_cpu():
+    MODELS += [
+        # fused_moe is not supported on CPU
+        "openbmb/MiniCPM3-4B",
+    ]
+
+# TODO: remove this after CPU float16 support ready
+target_dtype = "float" if current_platform.is_cpu() else "half"
+
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
     hf_runner,
     vllm_runner,
@@ -33,33 +47,24 @@ def test_models(
     model: str,
     dtype: str,
     max_tokens: int,
+    num_logprobs: int,
 ) -> None:
-    # To pass the small model tests, we need full precision.
-    assert dtype == "float"
 
     with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)
 
-    check_outputs_equal(
+    check_logprobs_close(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
         name_0="hf",
         name_1="vllm",
     )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
diff --git a/tests/models/decoder_only/language/test_qwen.py b/tests/models/decoder_only/language/test_qwen.py
deleted file mode 100644
index 128fe65afbb84..0000000000000
--- a/tests/models/decoder_only/language/test_qwen.py
+++ /dev/null
@@ -1,34 +0,0 @@
-"""Ensure that a text-only Qwen model can be run without throwing an error.
-We explicitly test this because Qwen is implemented as a multimodal and
-supports a visual encoder for models like Qwen-VL.
-"""
-from typing import List, Type
-
-import pytest
-
-from ....conftest import VllmRunner
-
-models = [
-    "Qwen/Qwen-7B-Chat"  # Has no visual encoder
-]
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_text_only_qwen_model_can_be_loaded_and_run(
-    vllm_runner: Type[VllmRunner],
-    example_prompts: List[str],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-):
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_model.generate_greedy_logprobs(
-            example_prompts,
-            max_tokens,
-            num_logprobs=num_logprobs,
-        )

From 235366fe2eb3144321978e181af94487f0215595 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Tue, 5 Nov 2024 16:02:32 -0500
Subject: [PATCH 68/85] [CI] Prune back the number of tests in tests/kernels/*
 (#9932)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 tests/kernels/test_activation.py            |  2 +-
 tests/kernels/test_attention.py             |  2 +-
 tests/kernels/test_awq_marlin.py            | 16 ++++++-----
 tests/kernels/test_blocksparse_attention.py |  6 ++---
 tests/kernels/test_cache.py                 |  2 +-
 tests/kernels/test_cutlass.py               | 30 ++++++++++++++++-----
 tests/kernels/test_int8_quant.py            |  7 +++--
 tests/kernels/test_marlin_gemm.py           |  2 +-
 tests/kernels/test_moe.py                   | 23 +++++++++-------
 tests/kernels/test_pos_encoding.py          |  6 ++---
 10 files changed, 60 insertions(+), 36 deletions(-)

diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py
index 057a11746014c..a84501f9c303f 100644
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -14,7 +14,7 @@
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
-D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
+D = [512, 13824]  # Arbitrary values for testing
 SEEDS = [0]
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 4ecd0fc1a21ad..3e3c0668198ad 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -33,7 +33,7 @@
 
 # FlashAttention forward only supports head dimension at most 128
 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
-HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
+HEAD_SIZES = [64, 80, 120, 256]
 
 BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py
index 59917dd2c58ad..238d6426bf099 100644
--- a/tests/kernels/test_awq_marlin.py
+++ b/tests/kernels/test_awq_marlin.py
@@ -14,13 +14,17 @@
     awq_marlin_quantize)
 from vllm.scalar_type import scalar_types
 
+NUM_EXPERTS = [8, 64]
+TOP_KS = [2, 6]
+GROUP_SIZES = [-1, 32, 128]
 
-@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
-@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
-@pytest.mark.parametrize("k", [128, 1024, 512])
-@pytest.mark.parametrize("e", [8, 64])
-@pytest.mark.parametrize("topk", [2, 6])
-@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+
+@pytest.mark.parametrize("m", [1, 33, 64, 222])
+@pytest.mark.parametrize("n", [128, 2048])
+@pytest.mark.parametrize("k", [128, 1024])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("group_size", GROUP_SIZES)
 @pytest.mark.skipif(not (ops.supports_moe_ops
                          and hasattr(torch.ops._moe_C, "marlin_gemm_moe")),
                     reason="Marlin is not supported on this GPU type.")
diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py
index fb601852dd523..fad342d1b5923 100644
--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -25,10 +25,10 @@
 DTYPES = [torch.half, torch.bfloat16]
 NUM_GEN_SEQS = [3]  # Arbitrary values for testing
 NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
-NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
+NUM_HEADS = [(40, 40)]  # Arbitrary values for testing
 
 HEAD_SIZES = [64, 112]
-BLOCK_SIZES = [16, 32]
+BLOCK_SIZES = [16]
 USE_ALIBI = [False, True]
 KV_CACHE_DTYPE = ["auto", "fp8"]
 SEEDS = [0]
@@ -37,7 +37,7 @@
 BLOCKSPARSE_VERT_STRIDES = [8]
 
 BLOCKSPARSE_BLOCK_SIZES = [64]
-BLOCKSPARSE_HEADS_SLIDINGS = [0, 2, -1]
+BLOCKSPARSE_HEADS_SLIDINGS = [2, -1]
 BLOCKSPARSE_HOMO_HEADS = [True, False]
 
 
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index e2b4778b94b9e..40550ed51e2c7 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -13,7 +13,7 @@
 NUM_TOKENS = [42]  # Arbitrary values for testing
 NUM_LAYERS = [1]  # Arbitrary values for testing
 NUM_HEADS = [8]  # Arbitrary values for testing
-HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
+HEAD_SIZES = [64, 80, 120, 256]
 BLOCK_SIZES = [8, 16, 32]
 
 # Arbitrary values for testing
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index 993e67e827ea0..afe53797322f9 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -11,6 +11,28 @@
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 
+MNK_FACTORS = [
+    (1, 256, 128),
+    (1, 16384, 1024),
+    (1, 24576, 496),
+    (16, 256, 496),
+    (16, 16384, 128),
+    (16, 24576, 4096),
+    (32, 8192, 4096),
+    (32, 16384, 4096),
+    (33, 1024, 1024),
+    (33, 8192, 128),
+    (64, 2048, 496),
+    (64, 16384, 1024),
+    (100, 8192, 496),
+    (128, 32768, 4096),
+    (256, 4096, 4096),
+    (512, 256, 1024),
+    (512, 8192, 4096),
+    (512, 16384, 128),
+    (512, 24576, 128),
+]
+
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
@@ -116,9 +138,7 @@ def cutlass_int8_gemm_helper(m: int,
             (out, a, b, scale_a, scale_b, bias))
 
 
-@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
-@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024])
-@pytest.mark.parametrize("k", [128, 496, 1024])
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("use_bias", [True, False])
@@ -129,9 +149,7 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
     cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias)
 
 
-@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 33, 1])
-@pytest.mark.parametrize("n", [2048, 8192, 16384, 256, 1024])
-@pytest.mark.parametrize("k", [128, 496, 1024])
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("use_bias", [True, False])
diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py
index 8db6a0d0d9fa4..12c578db0893c 100644
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -7,11 +7,10 @@
 from vllm.platforms import current_platform
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
-HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
-                8193]  # Arbitrary values for testing
+HIDDEN_SIZES = [16, 67, 768, 5137, 8193]  # Arbitrary values for testing
 NUM_TOKENS = [1, 7, 83, 4096]  # Arbitrary values for testing
 SEEDS = [0]
-SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
+SCALE = [0.1, 2.1]
 
 
 def opcheck_int8_quant_static(output, input, scale, azp=None):
@@ -132,7 +131,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("scale", SCALE[2:])  # Reduce test time
+@pytest.mark.parametrize("scale", SCALE)
 @pytest.mark.parametrize("azp", [-255, 54])
 @torch.inference_mode()
 def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py
index 5cfd4d6da7a86..b6dd68cc51a9f 100644
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -35,7 +35,7 @@
 USE_FP32_REDUCE_OPTS = [False, True]
 
 MARLIN_K_CHUNKS = [128]
-MARLIN_N_CHUNKS = [64, 128, 256]
+MARLIN_N_CHUNKS = [64, 256]
 
 MARLIN_24_K_CHUNKS = [128]
 MARLIN_24_N_CHUNKS = [512]
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 19c3fc1e1fe3a..17428ebfc2e28 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -20,12 +20,15 @@
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
+NUM_EXPERTS = [8, 64]
+TOP_KS = [2, 6]
 
-@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
-@pytest.mark.parametrize("n", [2048, 256, 1024])
+
+@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128])
+@pytest.mark.parametrize("n", [128, 1024, 2048])
 @pytest.mark.parametrize("k", [128, 511, 1024])
-@pytest.mark.parametrize("e", [8, 64])
-@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_fused_moe(
     m: int,
@@ -93,12 +96,12 @@ def test_mixtral_moe(dtype: torch.dtype):
                                atol=mixtral_moe_tol[dtype])
 
 
-@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
-@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
-@pytest.mark.parametrize("k", [128, 1024, 512])
-@pytest.mark.parametrize("e", [8, 64])
-@pytest.mark.parametrize("topk", [2, 6])
-@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("m", [1, 33, 64, 222])
+@pytest.mark.parametrize("n", [128, 2048])
+@pytest.mark.parametrize("k", [128, 1024])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("group_size", [-1, 32, 128])
 @pytest.mark.parametrize("act_order", [True, False])
 @pytest.mark.parametrize("num_bits", [4, 8])
 @pytest.mark.parametrize("is_k_full", [True, False])
diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py
index b408559cc0b07..eee77c22ab81a 100644
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -11,10 +11,10 @@
 
 IS_NEOX_STYLE = [True, False]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
-HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
+HEAD_SIZES = [64, 80, 112, 120, 256]
 ROTARY_DIMS = [None, 32]  # None means rotary dim == head size
-NUM_HEADS = [7, 17]  # Arbitrary values for testing
-BATCH_SIZES = [1, 5]  # Arbitrary values for testing
+NUM_HEADS = [17]  # Arbitrary values for testing
+BATCH_SIZES = [5]  # Arbitrary values for testing
 SEQ_LENS = [11, 8192]  # Arbitrary values for testing
 SEEDS = [0]
 CUDA_DEVICES = [

From ca9844b340f45f23f8d30fdce23777d215ad987c Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 5 Nov 2024 14:49:20 -0800
Subject: [PATCH 69/85] [bugfix] fix weak ref in piecewise cudagraph and
 tractable test (#10048)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/compile/piecewise/test_toy_llama.py | 111 ++++++++++++++++++++--
 vllm/compilation/backends.py              |  82 +++++++++++++---
 2 files changed, 168 insertions(+), 25 deletions(-)

diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index e3e5a7d0fc5a5..9c65059c6b348 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -1,6 +1,10 @@
 """
 Test the piecewise compilation with a simple model, comparing the output
 with and without the piecewise compilation.
+
+This is a tractable model: the weights and computation are specially designed
+if the config `tractable_init` is set to True. Otherwise, the weights are
+initialized randomly with a fixed seed.
 """
 import os
 from dataclasses import dataclass
@@ -49,6 +53,12 @@ class LlamaConfig:
     mlp_size: int = 256
     vocab_size: int = 128
     num_layers: int = 2
+    init_value: float = 1.0
+    tractable_init: bool = False
+    random_seed: int = 0
+
+    def __post_init__(self):
+        assert self.mlp_size >= self.hidden_size
 
 
 class LlamaMLP(nn.Module):
@@ -66,10 +76,23 @@ def __init__(self, config: LlamaConfig) -> None:
             bias=False,
         )
 
-        self.gate_up_projection.weight.data.fill_(0.0)
-        self.down_projection.weight.data.fill_(0.0)
+        if config.tractable_init:
+            nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size])
+            nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:])
+            nn.init.eye_(self.down_projection.weight.data)
+        else:
+            nn.init.xavier_normal_(self.gate_up_projection.weight.data,
+                                   generator=torch.Generator().manual_seed(
+                                       config.random_seed),
+                                   gain=0.001)
+            nn.init.xavier_normal_(self.down_projection.weight.data,
+                                   generator=torch.Generator().manual_seed(
+                                       config.random_seed),
+                                   gain=0.001)
 
     def forward(self, x):
+        # for tractable_init and positive input, this is
+        # essentially an elementwise-square
         x = self.gate_up_projection(x)
         x = x[:, :x.size(1) // 2] * torch.nn.functional.relu(
             x[:, x.size(1) // 2:])
@@ -84,21 +107,39 @@ def __init__(self, config: LlamaConfig) -> None:
         self.qkv_projection = nn.Linear(
             in_features=config.hidden_size,
             out_features=config.hidden_size * 3,
+            bias=False,
         )
 
         self.output_projection = nn.Linear(
             in_features=config.hidden_size,
             out_features=config.hidden_size,
+            bias=False,
         )
 
-        self.qkv_projection.weight.data.fill_(0.0)
-        self.output_projection.weight.data.fill_(0.0)
+        if config.tractable_init:
+            nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size])
+            nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 *
+                                                         config.hidden_size])
+            nn.init.eye_(self.qkv_projection.weight.data[2 *
+                                                         config.hidden_size:])
+            nn.init.eye_(self.output_projection.weight.data)
+        else:
+            nn.init.xavier_normal_(self.qkv_projection.weight.data,
+                                   generator=torch.Generator().manual_seed(
+                                       config.random_seed),
+                                   gain=0.001)
+            nn.init.xavier_normal_(self.output_projection.weight.data,
+                                   generator=torch.Generator().manual_seed(
+                                       config.random_seed),
+                                   gain=0.001)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor:
+        # for tractable_init, this is:
+        # output = (hidden_states * 3 + positions * 2)
         qkv = self.qkv_projection(hidden_states)
         hidden_size = qkv.size(-1) // 3
         q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1)
@@ -126,20 +167,29 @@ def forward(
         hidden_states: torch.Tensor,
         residual: Optional[torch.Tensor],
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        For tractable computation:
+        - if residual is None, the outputs are:
+            - residual = (hidden_states + 1) * 3 + positions * 2 + hidden_states = hidden_states * 4 + positions * 2 + 3
+            - hidden_states = (residual + 1) ** 2
+        - if residual is not None, the outputs are:
+            - residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3
+            - hidden_states = (residual + 1) ** 2
+        """ # noqa
         if residual is None:
             residual = hidden_states
-            hidden_states = hidden_states / 2
+            hidden_states = hidden_states + 1
         else:
             hidden_states = hidden_states + residual
             residual = hidden_states
-            hidden_states = hidden_states / 2
+            hidden_states = hidden_states + 1
 
         hidden_states = self.self_attention(positions=positions,
                                             hidden_states=hidden_states)
 
         hidden_states = hidden_states + residual
         residual = hidden_states
-        hidden_states = hidden_states / 2
+        hidden_states = hidden_states + 1
         hidden_states = self.mlp(hidden_states)
 
         return hidden_states, residual
@@ -156,7 +206,8 @@ def __init__(self, config: LlamaConfig) -> None:
         self.layers = nn.ModuleList(
             [LlamaDecoderLayer(config) for _ in range(config.num_layers)])
 
-        self.embedding_tokens.weight.data.fill_(0.0)
+        # this is the initial value of the hidden states
+        self.embedding_tokens.weight.data.fill_(config.init_value)
 
     def forward(
         self,
@@ -170,6 +221,28 @@ def forward(
         return hidden_states
 
 
+def tractable_computation(input_ids: torch.Tensor,
+                          positions: torch.Tensor,
+                          config: LlamaConfig,
+                          init_value: float = 1.0) -> torch.Tensor:
+    hidden_states = torch.ones(input_ids.size(0),
+                               config.hidden_size,
+                               device=input_ids.device,
+                               dtype=input_ids.dtype) * init_value
+
+    # first layer
+    residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
+    hidden_states = (residual + 1)**2
+
+    # following layers
+    for _ in range(config.num_layers - 1):
+        hidden_states = hidden_states + residual
+        residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
+        hidden_states = (residual + 1)**2
+
+    return hidden_states
+
+
 @torch.inference_mode
 def run_model(llama_config,
               use_compile: bool,
@@ -213,7 +286,15 @@ def run_model(llama_config,
     del os.environ["VLLM_TORCH_COMPILE_LEVEL"]
     set_compilation_config(None)
 
-    return output.cpu()
+    output = output.cpu()
+
+    if llama_config.tractable_init:
+        expected_output = tractable_computation(input_ids[:2], positions[:2],
+                                                llama_config).cpu()
+
+        assert torch.allclose(output, expected_output)
+    else:
+        return output.cpu()
 
 
 def test_toy_llama():
@@ -222,7 +303,13 @@ def test_toy_llama():
     llama_config = LlamaConfig(hidden_size=128,
                                mlp_size=256,
                                vocab_size=128,
-                               num_layers=2)
+                               num_layers=12)
+
+    tractable_config = LlamaConfig(hidden_size=128,
+                                   mlp_size=256,
+                                   vocab_size=128,
+                                   num_layers=2,
+                                   tractable_init=True)
 
     outputs = []
     with compilation_counter.expect(
@@ -233,6 +320,8 @@ def test_toy_llama():
             num_cudagraph_caputured=0,
     ):
         outputs.append(run_model(llama_config, use_compile=False))
+    run_model(tractable_config, use_compile=False)
+
     with compilation_counter.expect(
             num_graphs_seen=1,  # one graph for the model
             num_piecewise_graphs_seen=1,
@@ -242,6 +331,7 @@ def test_toy_llama():
             2,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ):
         outputs.append(run_model(llama_config, use_compile=True))
+    run_model(tractable_config, use_compile=True)
 
     with compilation_counter.expect(
             num_graphs_seen=1,  # one graph for the model
@@ -257,6 +347,7 @@ def test_toy_llama():
     ):
         outputs.append(
             run_model(llama_config, use_compile=True, split_attn=True))
+    run_model(tractable_config, use_compile=True, split_attn=True)
 
     for i in range(1, len(outputs)):
         assert torch.allclose(outputs[0], outputs[i])
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 96ddcba467c5b..de32cabbe6d07 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -6,6 +6,7 @@
 import torch
 import torch.fx as fx
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.utils import weak_ref_tensors
 
@@ -193,6 +194,7 @@ def wrap_inductor(graph,
 @dataclasses.dataclass
 class SplitItem:
     submod_name: str
+    graph_id: int
     is_splitting_graph: bool
     graph: fx.GraphModule
 
@@ -226,9 +228,7 @@ def split_graph(graph: fx.GraphModule,
 
     outputs = []
 
-    # sort the names to make sure the order is deterministic
     names = [name for (name, module) in split_gm.named_modules()]
-    names.sort()
 
     for name in names:
         if "." in name or name == "":
@@ -238,7 +238,11 @@ def split_graph(graph: fx.GraphModule,
         module = getattr(split_gm, name)
 
         graph_id = int(name.replace("submod_", ""))
-        outputs.append(SplitItem(name, graph_id in split_op_graphs, module))
+        outputs.append(
+            SplitItem(name, graph_id, (graph_id in split_op_graphs), module))
+
+    # sort by integer graph_id, rather than string name
+    outputs.sort(key=lambda x: x.graph_id)
 
     return split_gm, outputs
 
@@ -252,6 +256,11 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
     It runs the given graph with fake inputs, and compile some
     submodules specified by `compile_submod_names` with the given
     compilation configs.
+
+    NOTE: the order in `compile_submod_names` matters, because
+    it will be used to determine the order of the compiled piecewise
+    graphs. The first graph will handle logging, and the last graph
+    has some special cudagraph output handling.
     """
 
     def __init__(self, module: torch.fx.GraphModule,
@@ -263,7 +272,6 @@ def __init__(self, module: torch.fx.GraphModule,
         self.compile_submod_names = compile_submod_names
         self.compilation_configs = compilation_configs
         self.graph_pool = graph_pool
-        self.have_seen_first_graph = False
 
     def run(self, *args):
         fake_args = [
@@ -279,6 +287,7 @@ def call_module(self, target: torch.fx.node.Target,
         output = super().call_module(target, args, kwargs)
 
         if target in self.compile_submod_names:
+            index = self.compile_submod_names.index(target)
             submod = self.fetch_attr(target)
             sym_shape_indices = [
                 i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
@@ -288,15 +297,14 @@ def call_module(self, target: torch.fx.node.Target,
                 args,
                 self.compilation_configs.inductor_compile_config,
                 runtime_shape=None,
-                do_logging=not self.have_seen_first_graph,
+                do_logging=index == 0,
                 use_inductor=self.compilation_configs.use_inductor)
 
             self.module.__dict__[target] = PiecewiseBackend(
-                submod, self.compilation_configs, self.graph_pool,
-                not self.have_seen_first_graph, sym_shape_indices,
+                submod, self.compilation_configs, self.graph_pool, index,
+                len(self.compile_submod_names), sym_shape_indices,
                 compiled_graph_for_general_shape)
 
-            self.have_seen_first_graph = True
             compilation_counter.num_piecewise_capturable_graphs_seen += 1
 
         return output
@@ -352,8 +360,9 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             graph, self.compilation_configs.non_cudagraph_ops)
 
         from torch._dynamo.utils import lazy_format_graph_code
-        logger.debug("%s",
-                     lazy_format_graph_code("stiching module", self.split_gm))
+        logger.debug("%s", lazy_format_graph_code("before split", self.graph))
+        logger.debug("%s", lazy_format_graph_code("after split",
+                                                  self.split_gm))
 
         compilation_counter.num_piecewise_graphs_seen += len(
             self.piecewise_graphs)
@@ -385,12 +394,17 @@ class ConcreteSizeEntry:
     cudagraph: Optional[torch.cuda.CUDAGraph] = None
     output: Optional[Any] = None
 
+    # for cudagraph debugging, track the input addresses
+    # during capture, and check if they are the same during replay
+    input_addresses: Optional[List[int]] = None
+
 
 class PiecewiseBackend:
 
     def __init__(self, graph: fx.GraphModule,
                  compilation_configs: CompilationConfig, graph_pool: Any,
-                 is_first_graph: bool, sym_shape_indices: List[int],
+                 piecewise_compile_index: int, total_piecewise_compiles: int,
+                 sym_shape_indices: List[int],
                  compiled_graph_for_general_shape: Callable):
         """
         The backend for piecewise compilation.
@@ -408,7 +422,12 @@ def __init__(self, graph: fx.GraphModule,
         self.graph = graph
         self.compilation_configs = compilation_configs
         self.graph_pool = graph_pool
-        self.is_first_graph = is_first_graph
+        self.piecewise_compile_index = piecewise_compile_index
+        self.total_piecewise_compiles = total_piecewise_compiles
+
+        self.is_first_graph = piecewise_compile_index == 0
+        self.is_last_graph = (
+            piecewise_compile_index == total_piecewise_compiles - 1)
 
         self.compile_sizes: Set[int] = set(
             self.compilation_configs.compile_sizes)
@@ -422,6 +441,8 @@ def __init__(self, graph: fx.GraphModule,
 
         self.sym_shape_indices = sym_shape_indices
 
+        self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
+
         # the entries for different shapes that we need to either
         # compile or capture cudagraph
         self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
@@ -476,14 +497,45 @@ def __call__(self, *args) -> Any:
                 logger.info("Capturing a cudagraph for shape %s",
                             runtime_shape)
 
+            input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            entry.input_addresses = input_addresses
             cudagraph = torch.cuda.CUDAGraph()
+
+            # mind-exploding: carefully manage the reference and memory.
             with torch.cuda.graph(cudagraph, pool=self.graph_pool):
-                entry.output = weak_ref_tensors(entry.runnable(*args))
+                # `output` is managed by pytorch's cudagraph pool
+                output = entry.runnable(*args)
+                if self.is_last_graph:
+                    # by converting it to weak ref,
+                    # the original `output` will immediately be released
+                    # to save memory. It is only safe to do this for
+                    # the last graph, because the output of the last graph
+                    # will not be used by any other cuda graph.
+                    output = weak_ref_tensors(output)
+
+            # here we always use weak ref for the output
+            # to save memory
+            entry.output = weak_ref_tensors(output)
+            entry.cudagraph = cudagraph
 
             compilation_counter.num_cudagraph_caputured += 1
 
-            entry.cudagraph = cudagraph
-            return entry.output
+            # important: we need to return the output, rather than
+            # the weak ref of the output, so that pytorch can correctly
+            # manage the memory during cuda graph capture
+            return output
+
+        if self.is_debugging_mode:
+            # check if the input addresses are the same
+            new_input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            assert new_input_addresses == entry.input_addresses, (
+                "Input addresses for cudagraphs are different during replay."
+                f" Expected {entry.input_addresses}, got {new_input_addresses}"
+            )
 
         entry.cudagraph.replay()
         return entry.output

From 43300bd98a54d48e97d9fb78c9db88eda3a88c64 Mon Sep 17 00:00:00 2001
From: zifeitong <zifeitong@gmail.com>
Date: Tue, 5 Nov 2024 16:34:40 -0800
Subject: [PATCH 70/85] [Bugfix] Properly propagate trust_remote_code settings
 (#10047)

Signed-off-by: Zifei Tong <zifeitong@gmail.com>
---
 vllm/model_executor/models/chatglm.py |  7 ++++---
 vllm/model_executor/models/molmo.py   | 22 ++++++++++++----------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index c3c9ec703c1e6..181f3c2b0fc35 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -54,8 +54,9 @@ def mm_input_mapper_for_glmv(
     data: MultiModalData[object],
 ) -> Dict:
     model_config = ctx.model_config
-    tokenizer = cached_get_tokenizer(model_config.tokenizer,
-                                     trust_remote_code=True)
+    tokenizer = cached_get_tokenizer(
+        model_config.tokenizer,
+        trust_remote_code=model_config.trust_remote_code)
     if tokenizer is None:
         raise RuntimeError("No HuggingFace processor is available "
                            "to process the image object")
@@ -525,7 +526,7 @@ def _parse_and_validate_image_input(
             elif isinstance(pixel_values, list):
                 return torch.concat(pixel_values)
             else:
-                raise TypeError("""pixel_values must be a torch.Tensor 
+                raise TypeError("""pixel_values must be a torch.Tensor
                     or a list of torch.Tensor
                     """)
         return GLMImagePixelInputs(pixel_values=pixel_values)
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index ba798833e26a9..07c06149f0206 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -844,9 +844,10 @@ def get_max_tokens(max_crops: int, crop_patches: int, left_margin: int,
 
 
 def get_max_molmo_image_tokens(ctx: InputContext) -> int:
-    processor = cached_get_processor(ctx.model_config.model,
-                                     trust_remote_code=True,
-                                     revision=ctx.model_config.code_revision)
+    processor = cached_get_processor(
+        ctx.model_config.model,
+        trust_remote_code=ctx.model_config.trust_remote_code,
+        revision=ctx.model_config.code_revision)
     image_processor = processor.image_processor
     max_llm_image_tokens = get_max_tokens(
         image_processor.max_crops,
@@ -870,9 +871,10 @@ def image_input_mapper_for_molmo(
 
 def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
                          mm_counts: Mapping[str, int]):
-    processor = cached_get_processor(ctx.model_config.model,
-                                     trust_remote_code=True,
-                                     revision=ctx.model_config.code_revision)
+    processor = cached_get_processor(
+        ctx.model_config.model,
+        trust_remote_code=ctx.model_config.trust_remote_code,
+        revision=ctx.model_config.code_revision)
     image_processor = processor.image_processor
 
     base_image_input_d = image_processor.image_patch_size
@@ -935,11 +937,11 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
     multi_modal_data = inputs.get("multi_modal_data")
     image = None if multi_modal_data is None else multi_modal_data.get("image")
 
-    processor = cached_get_processor(ctx.model_config.model,
-                                     trust_remote_code=True,
-                                     revision=ctx.model_config.code_revision)
-
     model_config = ctx.model_config
+    processor = cached_get_processor(
+        ctx.model_config.model,
+        trust_remote_code=model_config.trust_remote_code,
+        revision=ctx.model_config.code_revision)
     tokenizer = cached_get_tokenizer(
         model_config.tokenizer,
         trust_remote_code=model_config.trust_remote_code)

From 966e31697bdeb47b33b3e26b4aab5999c85f3e90 Mon Sep 17 00:00:00 2001
From: Wallas Henrique <wallashss@users.noreply.github.com>
Date: Tue, 5 Nov 2024 21:39:26 -0300
Subject: [PATCH 71/85] [Bugfix] Fix pickle of input when async output
 processing is on (#9931)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
---
 .../test_basic_correctness.py                 | 26 +++++++++++++++++++
 vllm/worker/model_runner.py                   | 12 +++++++++
 2 files changed, 38 insertions(+)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 79647589d5204..7f16baa65a644 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -156,3 +156,29 @@ def test_model_with_failure(vllm_runner) -> None:
                           ModelInputForGPUWithSamplingMetadata)
     finally:
         os.remove(filename)
+
+
+def test_failure_with_async_out_proc(vllm_runner) -> None:
+
+    filename = None
+    try:
+        with vllm_runner("facebook/opt-125m",
+                         dtype="half",
+                         enforce_eager=False,
+                         gpu_memory_utilization=0.7) as vllm_model,\
+             patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
+                       side_effect=ValueError()):
+            model_config = vllm_model.model.llm_engine.model_config
+            assert model_config.use_async_output_proc
+            with pytest.raises(ValueError) as exc_info:
+                vllm_model.generate_greedy('how to make pizza?', 250)
+            matches = re.search(r"input dumped to (.+).pkl",
+                                str(exc_info.value))
+            assert matches is not None
+
+            filename = f"{matches.group(1)}.pkl"
+    finally:
+        # Clean up
+        if filename is not None:
+            os.remove(filename)
+        pass
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 2447eecf7957d..1e8ea4e8e79cf 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -136,6 +136,18 @@ def from_broadcasted_tensor_dict(
                 attn_backend, tensor_dict)
         return cls(**tensor_dict)
 
+    # Exclude `async_callback` to be able to pickle this object
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state["async_callback"]
+        return state
+
+    # TODO: What happens when we unpickle this object?
+    # How can we update this callback to properly pass it to the engine?
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        self.__dict__.update({'async_callback': None})
+
 
 @dataclass(frozen=True)
 class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU):

From 0c63c34f725f0b519fa094fbeca6e3cf12c911c1 Mon Sep 17 00:00:00 2001
From: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Date: Wed, 6 Nov 2024 10:45:45 +0900
Subject: [PATCH 72/85] [Bugfix][SpecDecode] kv corruption with bonus tokens in
 spec decode (#9730)

Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
---
 tests/spec_decode/test_multi_step_worker.py | 107 ++++++++++++++++++++
 tests/spec_decode/utils.py                  |   4 +-
 vllm/spec_decode/draft_model_runner.py      |  35 ++++++-
 vllm/spec_decode/multi_step_worker.py       |  23 ++++-
 4 files changed, 159 insertions(+), 10 deletions(-)

diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index e6f7f480eebb2..0b5d82b6610ca 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -5,6 +5,8 @@
 import pytest
 import torch
 
+from vllm.attention.selector import (_Backend,
+                                     global_force_attn_backend_context_manager)
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob,
@@ -303,6 +305,7 @@ def test_multi_step_with_batch_expansion_correct_output():
         seed,
         model_runner_cls=TP1DraftModelRunner,
     )
+    multi_step_worker.set_include_gpu_probs_tensor()
     worker = create_worker(
         Worker,
         model_name,
@@ -397,6 +400,7 @@ def test_multi_step_with_batch_expansion_incorrect_output():
         seed,
         model_runner_cls=TP1DraftModelRunner,
     )
+    multi_step_worker.set_include_gpu_probs_tensor()
     worker = create_worker(
         Worker,
         model_name,
@@ -477,6 +481,109 @@ def test_multi_step_with_batch_expansion_incorrect_output():
     assert (num_mismatch > 0)
 
 
+@torch.inference_mode()
+@pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
+# The choice of backends forces the multi_step_worker to choose between
+# the vanilla model_runner and TP1DraftModelRunner, so that we can test
+# both code paths.
+@pytest.mark.parametrize('attn_backend',
+                         [_Backend.XFORMERS, _Backend.FLASH_ATTN])
+def test_multi_step_correct_kvcache(num_steps, attn_backend):
+    """Verify that the KV cache of the draft model 
+    is correctly updated for sequences with bonus token.
+    """
+    seed = 100
+    model_name = "JackFram/llama-68m"
+
+    block_size = 16
+    num_gpu_blocks = 2048 // block_size
+    batch_size = 1
+
+    with global_force_attn_backend_context_manager(attn_backend):
+        dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
+        multi_step_worker = create_worker(MultiStepWorker,
+                                          model_name,
+                                          block_size,
+                                          num_gpu_blocks,
+                                          seed,
+                                          model_runner_cls=TP1DraftModelRunner,
+                                          dtype=dtype)
+        multi_step_worker.set_include_gpu_probs_tensor()
+        worker = create_worker(Worker,
+                               model_name,
+                               block_size,
+                               num_gpu_blocks,
+                               seed,
+                               dtype=dtype)
+
+        prompts = [[0] for _ in range(batch_size)]
+        # Already generate two tokens for the sequence
+        # so that we can simulate the bonus token case
+        multi_step_continuations = [[
+            random.randint(0, 1000),
+            random.randint(0, 1000)
+        ] for _ in prompts]
+        final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
+
+        seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
+        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+            prompts,
+            num_gpu_blocks,
+            block_size,
+            continuations=multi_step_continuations,
+            final_prompt_lens=final_prompt_lens)
+
+        # Run multi-step.
+        zero_kv_cache(multi_step_worker.cache_engine)
+        multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list),
+                                         sample_len=num_steps,
+                                         seq_ids_with_bonus_token_in_last_step=
+                                         seq_ids_with_bonus_token_in_last_step)
+
+        # Run single-step repeatedly.
+        zero_kv_cache(worker.cache_engine)
+        # Generate the kv cache for the bonus token first
+        single_step_continuations = [c[:1] for c in multi_step_continuations]
+        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+            prompts,
+            num_gpu_blocks,
+            block_size,
+            continuations=single_step_continuations,
+            final_prompt_lens=final_prompt_lens)
+        single_step_output = worker.execute_model(
+            execute_model_req=ExecuteModelRequest(
+                seq_group_metadata_list=seq_group_metadata_list))
+        for _ in range(num_steps):
+            seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+                prompts,
+                num_gpu_blocks,
+                block_size,
+                continuations=multi_step_continuations,
+                final_prompt_lens=final_prompt_lens)
+
+            single_step_output = worker.execute_model(
+                execute_model_req=ExecuteModelRequest(
+                    seq_group_metadata_list=seq_group_metadata_list))
+
+            for i, seq_group_output in enumerate(single_step_output[-1]):
+                multi_step_continuations[i].append(
+                    seq_group_output.samples[0].output_token)
+
+        # Verify that the KV cache of the single-step and
+        # multi-step workers are the same.
+        single_step_gpu_cache = worker.cache_engine[0].gpu_cache
+        multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
+        num_layers = len(single_step_gpu_cache)
+        allclose = lambda a, b: torch.allclose(
+            a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
+        for i in range(num_layers):
+            assert allclose(single_step_gpu_cache[i][0],
+                            multi_step_gpu_cache[i][0])
+            assert allclose(single_step_gpu_cache[i][1],
+                            multi_step_gpu_cache[i][1])
+
+
 @torch.inference_mode()
 def test_draft_proposals_full_speculation_len():
     """Verify Top1Proposer correctly handles case where all sequences
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index 6cf0cfb09b8fa..e5cb0530f9961 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -68,12 +68,14 @@ def create_worker(cls: Callable[..., T],
                   seed: int,
                   is_driver_worker: bool = True,
                   enforce_eager: bool = True,
-                  model_runner_cls: Optional[ModelRunner] = None) -> T:
+                  model_runner_cls: Optional[ModelRunner] = None,
+                  dtype: Optional[str] = "auto") -> T:
     engine_args = EngineArgs(
         model=model_name,
         seed=seed,
         block_size=block_size,
         enforce_eager=enforce_eager,
+        dtype=dtype,
     )
     engine_config = engine_args.create_engine_config()
 
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index 17cc0ad1a4a3a..6330ac027db74 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -54,6 +54,8 @@ def __init__(self, *args, **kwargs):
 
         super().__init__(*args, **kwargs)
 
+        self.indices_of_seq_with_bonus_tokens = None
+
     def _update_sampling_metadata(self, sampling_metadata, num_seqs,
                                   num_queries):
 
@@ -159,6 +161,10 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest):
         # TODO: Add soft-tuning prompt adapter support
         return not self.prompt_adapter_config
 
+    def set_indices_of_seq_with_bonus_tokens(self,
+                                             indices_of_seq_with_bonus_tokens):
+        self.indices_of_seq_with_bonus_tokens = indices_of_seq_with_bonus_tokens
+
     @torch.inference_mode()
     def execute_model(
         self,
@@ -284,11 +290,30 @@ def execute_model(
                                                model_input.sampling_metadata)
 
             # Sample the next token.
-            outputs.append(
-                self.model.sample(
-                    logits=logits,
-                    sampling_metadata=model_input.sampling_metadata,
-                ))
+            output = self.model.sample(
+                logits=logits,
+                sampling_metadata=model_input.sampling_metadata,
+            )
+            outputs.append(output)
+
+            if model_input.attn_metadata.num_prefills == 0 \
+                and self.indices_of_seq_with_bonus_tokens is not None:
+                assert output.sampled_token_ids is not None
+                # output.sampled_token_ids should be of shape (num_seqs, 1)
+                nums_seqs, num_tokens_per_seq = output.sampled_token_ids.shape
+                assert num_tokens_per_seq == 1
+                count = 0
+                for i in range(nums_seqs):
+                    bonus_seq_idx = self.indices_of_seq_with_bonus_tokens[
+                        count]
+                    if i != bonus_seq_idx:
+                        # The following might cause a cpu->gpu sync.
+                        # However, the performance impact is negligible as we
+                        # benchmarked on H100.
+                        output.sampled_token_ids[
+                            i, :] = model_input.input_tokens[bonus_seq_idx]
+                    else:
+                        count += 1
 
             # Prepare inputs for the next step
             if step != num_steps - 1:
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index 4b53fbe056c47..f49b98f5c9528 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -81,6 +81,8 @@ def sampler_output(
             # Here we run the draft_model_runner with multi-step prepare
             # on the GPU directly
             expanded_request.num_steps = sample_len
+            self.model_runner.set_indices_of_seq_with_bonus_tokens(
+                indices_of_seq_with_bonus_tokens)
             model_outputs = self.execute_model(
                 execute_model_req=expanded_request)
         else:
@@ -97,7 +99,8 @@ def sampler_output(
                 model_output = model_output[0]
 
                 self._append_new_tokens(
-                    model_output, expanded_request.seq_group_metadata_list)
+                    model_output, expanded_request.seq_group_metadata_list,
+                    indices_of_seq_with_bonus_tokens)
                 model_outputs.append(model_output)
 
         filtered_model_outputs = self._filter_model_output(
@@ -221,13 +224,15 @@ def get_spec_proposals(
     @staticmethod
     def _append_new_tokens(
             model_output: List[SamplerOutput],
-            seq_group_metadata_list: List[SequenceGroupMetadata]) -> None:
+            seq_group_metadata_list: List[SequenceGroupMetadata],
+            indices_of_seq_with_bonus_tokens: List[int]) -> None:
         """Given model output from a single run, append the tokens to the
         sequences. This is normally done outside of the worker, but it is
         required if the worker is to perform multiple forward passes.
         """
-        for seq_group_metadata, sequence_group_outputs in zip(
-                seq_group_metadata_list, model_output):
+        count = 0
+        for index, (seq_group_metadata, sequence_group_outputs) in enumerate(
+                zip(seq_group_metadata_list, model_output)):
             seq_group_metadata.is_prompt = False
 
             for seq_output in sequence_group_outputs.samples:
@@ -237,6 +242,16 @@ def _append_new_tokens(
 
                 token_id = seq_output.output_token
                 token_logprob = seq_output.logprobs[token_id]
+                # Determine the actual token ID to be generated,
+                # considering bonus tokens
+                if index != indices_of_seq_with_bonus_tokens[count]:
+                    bonus_seq_metadata = seq_group_metadata_list[
+                        indices_of_seq_with_bonus_tokens[count]]
+                    _, bonus_token_seq_data = next(
+                        iter(bonus_seq_metadata.seq_data.items()))
+                    token_id = bonus_token_seq_data.output_token_ids[-1]
+                else:
+                    count += 1
 
                 seq.append_token_id(token_id, token_logprob.logprob)
                 seq.update_num_computed_tokens(1)

From c4cacbaa7faf9d0d3b2aa26e5df496724e80cb05 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 5 Nov 2024 18:19:50 -0800
Subject: [PATCH 73/85] [v1] reduce graph capture time for piecewise cudagraph
 (#10059)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index de32cabbe6d07..05deee7bd5473 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -1,7 +1,9 @@
 import copy
 import dataclasses
 import operator
+from contextlib import ExitStack
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+from unittest.mock import patch
 
 import torch
 import torch.fx as fx
@@ -503,17 +505,29 @@ def __call__(self, *args) -> Any:
             entry.input_addresses = input_addresses
             cudagraph = torch.cuda.CUDAGraph()
 
-            # mind-exploding: carefully manage the reference and memory.
-            with torch.cuda.graph(cudagraph, pool=self.graph_pool):
-                # `output` is managed by pytorch's cudagraph pool
-                output = entry.runnable(*args)
-                if self.is_last_graph:
-                    # by converting it to weak ref,
-                    # the original `output` will immediately be released
-                    # to save memory. It is only safe to do this for
-                    # the last graph, because the output of the last graph
-                    # will not be used by any other cuda graph.
-                    output = weak_ref_tensors(output)
+            with ExitStack() as stack:
+                if not self.is_first_graph:
+                    # during every model forward, we will capture
+                    # many pieces of cudagraphs (roughly one per layer).
+                    # running gc again and again across layers will
+                    # make the cudagraph capture very slow.
+                    # therefore, we only run gc for the first graph,
+                    # and disable gc for the rest of the graphs.
+                    stack.enter_context(patch("gc.collect", lambda: None))
+                    stack.enter_context(
+                        patch("torch.cuda.empty_cache", lambda: None))
+
+                # mind-exploding: carefully manage the reference and memory.
+                with torch.cuda.graph(cudagraph, pool=self.graph_pool):
+                    # `output` is managed by pytorch's cudagraph pool
+                    output = entry.runnable(*args)
+                    if self.is_last_graph:
+                        # by converting it to weak ref,
+                        # the original `output` will immediately be released
+                        # to save memory. It is only safe to do this for
+                        # the last graph, because the output of the last graph
+                        # will not be used by any other cuda graph.
+                        output = weak_ref_tensors(output)
 
             # here we always use weak ref for the output
             # to save memory

From 82bfc38d079b1ef5f4b88ac7094a00029d2e99af Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 6 Nov 2024 12:05:05 +0800
Subject: [PATCH 74/85] [Misc] Sort the list of embedding models (#10037)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/registry.py | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index af52fbffba19e..792c6cec34ae0 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -94,33 +94,23 @@
 _EMBEDDING_MODELS = {
     # [Text-only]
     "BertModel": ("bert", "BertEmbeddingModel"),
+    "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
     "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"),
     "LlamaModel": ("llama", "LlamaEmbeddingModel"),
+    **{
+        # Multiple models share the same architecture, so we include them all
+        k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
+        if arch == "LlamaForCausalLM"
+    },
     "MistralModel": ("llama", "LlamaEmbeddingModel"),
-    "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
-    "Qwen2ForSequenceClassification": (
-        "qwen2_cls", "Qwen2ForSequenceClassification"),
-    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
     "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
-    "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
+    "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
+    "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"),  # noqa: E501
     # [Multimodal]
     "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
 }
 
-def add_embedding_models(base_models, embedding_models):
-    with_pooler_method_models = {}
-    embedding_models_name = embedding_models.keys()
-    for name, (path, arch) in base_models.items():
-        if arch in embedding_models_name:
-            with_pooler_method_models[name] = (path, arch)
-    return with_pooler_method_models
-
-_EMBEDDING_MODELS = {
-    **add_embedding_models(_TEXT_GENERATION_MODELS, _EMBEDDING_MODELS),
-    **_EMBEDDING_MODELS,
-}
-
 _MULTIMODAL_MODELS = {
     # [Decoder-only]
     "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),

From ffc0f2b47add6e0f70e2b5d4b4aaac64ee97f8ad Mon Sep 17 00:00:00 2001
From: Peter Salas <peter@fixie.ai>
Date: Tue, 5 Nov 2024 20:19:15 -0800
Subject: [PATCH 75/85] [Model][OpenVINO] Fix regressions from #8346 (#10045)

Signed-off-by: Peter Salas <peter@fixie.ai>
---
 .buildkite/run-openvino-test.sh     |  2 +-
 vllm/attention/backends/openvino.py | 12 +++++++++++-
 vllm/model_executor/models/molmo.py |  6 +++---
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
index 70e56596c4a86..35ad5c0ddde77 100755
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py
index 6fddfc2002120..be06d16009988 100644
--- a/vllm/attention/backends/openvino.py
+++ b/vllm/attention/backends/openvino.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Tuple, Type
+from typing import Dict, List, Optional, Tuple, Type
 
 import openvino as ov
 import torch
@@ -7,6 +7,7 @@
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadata)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.multimodal import MultiModalPlaceholderMap
 
 
 def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor,
@@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata:
     # Shape: scalar
     # Type: i32
     max_context_len: torch.Tensor
+
+    # The index maps that relate multi-modal embeddings to the corresponding
+    # placeholders.
+    #
+    # N.B. These aren't really related to attention and don't belong on this
+    # type -- this is just a temporary solution to make them available to
+    # `model_executable`.
+    multi_modal_placeholder_index_maps: Optional[Dict[
+        str, MultiModalPlaceholderMap.IndexMap]]
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 07c06149f0206..522aa748f78b6 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -21,8 +21,8 @@
                               get_tensor_model_parallel_world_size,
                               split_tensor_along_last_dim,
                               tensor_model_parallel_all_gather)
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -915,7 +915,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
     if "image_masks" in out:
         dummy_imgdata["image_masks"] = out["image_masks"]
     dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long)
-    return dummy_seqdata, {"image": dummy_imgdata}
+    return DummyData(dummy_seqdata, {"image": dummy_imgdata})
 
 
 def pad_images(

From 2bcbae704c0d52913c6a2887260fc6bde6c20361 Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Tue, 5 Nov 2024 21:28:29 -0700
Subject: [PATCH 76/85] [Bugfix] Fix edge-case crash when using chat with the
 Mistral Tekken Tokenizer (#10051)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
---
 tests/models/decoder_only/language/test_mistral.py | 9 ++++++---
 vllm/transformers_utils/tokenizers/mistral.py      | 8 ++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py
index 5be44c54a717c..6ec4b7e7e3f71 100644
--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -10,19 +10,22 @@
 
 MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.1",
-    "mistralai/Mistral-7B-Instruct-v0.3",
-    # Mistral-Nemo is to big for CI, but passes locally
-    # "mistralai/Mistral-Nemo-Instruct-2407"
 ]
 
 MISTRAL_FORMAT_MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.3",
+    # uses the v3-Tekken tokenizer
+    "mistralai/Ministral-8B-Instruct-2410",
+    # Mistral-Nemo is too big for CI, but passes locally
+    # "mistralai/Mistral-Nemo-Instruct-2407"
 ]
 
 SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 SYMBOLIC_LANG_PROMPTS = [
     "勇敢な船乗りについての詩を書く",  # japanese
     "寫一首關於勇敢的水手的詩",  # chinese
+    "ပုံပြင်လေးပြောပြပါ်:\n",  # burmese
+    "Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n",  # see https://github.com/vllm-project/vllm/pull/9625
 ]
 
 # for function calling
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 896f70bc1dafd..ccffdcc2a4df2 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -254,7 +254,7 @@ def decode(self,
                skip_special_tokens: bool = True) -> str:
         assert (
             skip_special_tokens
-        ), "Skipping special tokens is not supported for Mistral tokenizers."
+        ), "skip_special_tokens=False is not supported for Mistral tokenizers."
 
         if isinstance(ids, int):
             ids = [ids]
@@ -268,12 +268,16 @@ def convert_ids_to_tokens(
         # TODO(Patrick) - potentially allow special tokens to not be skipped
         assert (
             skip_special_tokens
-        ), "Skipping special tokens is not supported for Mistral tokenizers."
+        ), "skip_special_tokens=False is not supported for Mistral tokenizers."
 
         assert isinstance(self.tokenizer,
                           (Tekkenizer, SentencePieceTokenizer)), type(
                               self.tokenizer)
 
+        if isinstance(self.tokenizer, Tekkenizer):
+            # skip special tokens
+            ids = [i for i in ids if i > self.tokenizer.num_special_tokens]
+
         tokens = [self.tokenizer.id_to_piece(id) for id in ids]
 
         if any("�" in t for t in tokens):

From ea928f608c44b825d28609460e0d375a5f877940 Mon Sep 17 00:00:00 2001
From: arakowsk-amd <182798202+arakowsk-amd@users.noreply.github.com>
Date: Tue, 5 Nov 2024 21:10:40 -0800
Subject: [PATCH 77/85] [Bugfix] Gpt-j-6B patch kv_scale to k_scale path 
 (#10063)

Signed-off-by: Alex Rakowski <alex.rakowski@amd.com>
Signed-off-by: Alex Rakowski <182798202+arakowsk-amd@users.noreply.github.com>
---
 vllm/model_executor/models/gpt_j.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 0451d16b6c738..9a42b359ae44f 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -36,7 +36,8 @@
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
@@ -308,6 +309,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue

From 9d59b755934899b7ec5d7bb5b90d15bfd2302475 Mon Sep 17 00:00:00 2001
From: zifeitong <zifeitong@gmail.com>
Date: Tue, 5 Nov 2024 21:13:09 -0800
Subject: [PATCH 78/85] [Bugfix] Remove CustomChatCompletionContentPartParam
 multimodal input type (#10054)

Signed-off-by: Zifei Tong <zifeitong@gmail.com>
---
 vllm/entrypoints/chat_utils.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 8da08d4b2c93c..2b339ab6d44e4 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -22,7 +22,6 @@
                                ChatCompletionToolMessageParam)
 # yapf: enable
 # pydantic needs the TypedDict from typing_extensions
-from pydantic import ConfigDict
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 from typing_extensions import Required, TypeAlias, TypedDict
 
@@ -52,17 +51,10 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False):
     """The type of the content part."""
 
 
-class CustomChatCompletionContentPartParam(TypedDict, total=False):
-    __pydantic_config__ = ConfigDict(extra="allow")  # type: ignore
-
-    type: Required[str]
-    """The type of the content part."""
-
-
 class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain image_url.
     This is supported by OpenAI API, although it is not documented.
-    
+
     Example:
     {
         "image_url": "https://example.com/image.jpg"
@@ -73,7 +65,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
 
 class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain audio_url.
-    
+
     Example:
     {
         "audio_url": "https://example.com/audio.mp3"
@@ -85,7 +77,6 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
 ChatCompletionContentPartParam: TypeAlias = Union[
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
     ChatCompletionContentPartRefusalParam,
-    CustomChatCompletionContentPartParam,
     CustomChatCompletionContentSimpleImageParam,
     CustomChatCompletionContentSimpleAudioParam, str]
 

From 40899855520eb9497606bdb2b1b4e619233e598a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 5 Nov 2024 22:16:04 -0800
Subject: [PATCH 79/85] [V1] Integrate Piecewise CUDA graphs (#10058)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/compilation/backends.py             |   7 +-
 vllm/v1/attention/backends/flash_attn.py |  35 ++++---
 vllm/v1/worker/gpu_model_runner.py       | 127 +++++++++++++++++++----
 3 files changed, 133 insertions(+), 36 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 05deee7bd5473..abd1d16accaf7 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -496,8 +496,11 @@ def __call__(self, *args) -> Any:
                 return entry.runnable(*args)
 
             if self.is_first_graph:
-                logger.info("Capturing a cudagraph for shape %s",
-                            runtime_shape)
+                # Since we capture cudagraph for many different shapes and
+                # capturing is fast, we don't need to log it for every shape.
+                # We only log it in the debug mode.
+                logger.debug("Capturing a cudagraph for shape %s",
+                             runtime_shape)
 
             input_addresses = [
                 x.data_ptr() for x in args if isinstance(x, torch.Tensor)
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index b2af89ebf854a..906f06777a136 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -51,6 +51,7 @@ class FlashAttentionMetadata:
     # |-------------------- seq_len ---------------------|
     #                                   |-- query_len ---|
 
+    num_actual_tokens: int  # Number of tokens excluding padding.
     max_query_len: int
     query_start_loc: torch.Tensor
     max_seq_len: int
@@ -134,7 +135,9 @@ def forward(
         assert k_scale == 1.0 and v_scale == 1.0, (
             "key/v_scale is not supported in FlashAttention.")
 
-        output = torch.ops.vllm.unified_flash_attention(
+        output = torch.empty_like(query)
+        torch.ops.vllm.unified_flash_attention(
+            output,
             query,
             key,
             value,
@@ -154,6 +157,7 @@ def forward(
 
 
 def unified_flash_attention(
+    output: torch.Tensor,
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
@@ -168,17 +172,17 @@ def unified_flash_attention(
     window_size: Optional[List[int]] = None,
     alibi_slopes: Optional[torch.Tensor] = None,
     logits_soft_cap: Optional[float] = None,
-) -> torch.Tensor:
+) -> None:
     current_metadata = get_forward_context()
     if current_metadata is None:
         # Profiling run.
-        return torch.empty_like(query)
+        return
 
     assert current_metadata is not None
     assert isinstance(current_metadata, FlashAttentionMetadata)
     attn_metadata: FlashAttentionMetadata = current_metadata
+    num_actual_tokens = attn_metadata.num_actual_tokens
 
-    num_tokens, hidden_size = query.shape
     # Reshape the query, key, and value tensors.
     query = query.view(-1, num_heads, head_size)
     key = key.view(-1, num_kv_heads, head_size)
@@ -188,18 +192,18 @@ def unified_flash_attention(
     key_cache = kv_cache[0]
     value_cache = kv_cache[1]
     torch.ops._C_cache_ops.reshape_and_cache_flash(
-        key,
-        value,
-        kv_cache[0],
-        kv_cache[1],
+        key[:num_actual_tokens],
+        value[:num_actual_tokens],
+        key_cache,
+        value_cache,
         attn_metadata.slot_mapping,
         kv_cache_dtype,
         k_scale,
         v_scale,
     )
 
-    output = flash_attn_varlen_func(
-        q=query,
+    attn_output = flash_attn_varlen_func(
+        q=query[:num_actual_tokens],
         k=key_cache,
         v=value_cache,
         cu_seqlens_q=attn_metadata.query_start_loc,
@@ -213,10 +217,13 @@ def unified_flash_attention(
         block_table=attn_metadata.block_table,
         softcap=logits_soft_cap,
     )
-    return output.view(num_tokens, hidden_size)
+    attn_output = attn_output.view(num_actual_tokens, -1)
+    # TODO(woosuk): Optimize this.
+    output[:num_actual_tokens].copy_(attn_output)
 
 
 def unified_flash_attention_fake(
+    output: torch.Tensor,
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
@@ -231,13 +238,13 @@ def unified_flash_attention_fake(
     window_size: Optional[List[int]] = None,
     alibi_slopes: Optional[torch.Tensor] = None,
     logits_soft_cap: Optional[float] = None,
-) -> torch.Tensor:
-    return torch.empty_like(query)
+) -> None:
+    return
 
 
 direct_register_custom_op(
     op_name="unified_flash_attention",
     op_func=unified_flash_attention,
-    mutates_args=["kv_cache"],
+    mutates_args=["kv_cache", "output"],
     fake_impl=unified_flash_attention_fake,
 )
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index ae4239f8e1fab..63bf7c2e605a2 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1,3 +1,5 @@
+import os
+import time
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Set
 from unittest.mock import patch
@@ -7,11 +9,16 @@
 import torch.distributed
 import torch.nn as nn
 
+from vllm import envs
+from vllm.compilation.compile_context import set_compile_context
+from vllm.compilation.config import CompilationConfig
+from vllm.compilation.levels import CompilationLevel
 from vllm.config import VllmConfig
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import MultiModalDataDict
+from vllm.plugins import set_compilation_config
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv,
                         is_pin_memory_available)
@@ -86,6 +93,18 @@ def __init__(
             pin_memory=self.pin_memory,
         )
 
+        self.use_cuda_graph = (envs.VLLM_TORCH_COMPILE_LEVEL
+                               == CompilationLevel.PIECEWISE
+                               and not self.model_config.enforce_eager)
+        # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
+        self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)]
+        self.input_ids = torch.zeros(self.max_num_tokens,
+                                     dtype=torch.int32,
+                                     device=self.device)
+        self.positions = torch.zeros(self.max_num_tokens,
+                                     dtype=torch.int64,
+                                     device=self.device)
+
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Remove stopped requests from the cached states.
         # Keep the states of the pre-empted requests.
@@ -268,12 +287,16 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         seq_start_loc_np[0] = 0
         np.cumsum(seq_lens, out=seq_start_loc_np[1:])
 
-        input_ids = input_ids.to(self.device, non_blocking=True)
-        positions = positions.to(self.device, non_blocking=True).long()
+        self.input_ids[:total_num_scheduled_tokens].copy_(input_ids,
+                                                          non_blocking=True)
+        self.positions[:total_num_scheduled_tokens].copy_(positions,
+                                                          non_blocking=True)
+
         query_start_loc = query_start_loc.to(self.device, non_blocking=True)
         seq_start_loc = seq_start_loc.to(self.device, non_blocking=True)
         slot_mapping = slot_mapping.to(self.device, non_blocking=True).long()
         attn_metadata = FlashAttentionMetadata(
+            num_actual_tokens=total_num_scheduled_tokens,
             max_query_len=max_num_scheduled_tokens,
             query_start_loc=query_start_loc,
             max_seq_len=max_seq_len,
@@ -287,7 +310,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         # token from the partial request.
         # TODO: Support prompt logprobs.
         logits_indices = query_start_loc[1:] - 1
-        return input_ids, positions, attn_metadata, logits_indices
+        return attn_metadata, logits_indices
 
     def _prepare_sampling(
         self,
@@ -310,16 +333,26 @@ def execute_model(
         scheduler_output: "SchedulerOutput",
     ) -> ModelRunnerOutput:
         self._update_states(scheduler_output)
-        inputs = self._prepare_inputs(scheduler_output)
-        input_ids, positions, attn_metadata, logits_indices = inputs
+        attn_metadata, logits_indices = self._prepare_inputs(scheduler_output)
+        num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
+        if (self.use_cuda_graph
+                and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
+            # Use piecewise CUDA graphs.
+            # Add padding to the batch size.
+            num_input_tokens = self._get_padded_batch_size(
+                num_scheduled_tokens)
+        else:
+            # Eager mode.
+            num_input_tokens = num_scheduled_tokens
 
         with set_forward_context(attn_metadata):
             hidden_states = self.model(
-                input_ids=input_ids,
-                positions=positions,
+                input_ids=self.input_ids[:num_input_tokens],
+                positions=self.positions[:num_input_tokens],
                 kv_caches=self.kv_caches,
-                attn_metadata=attn_metadata,
+                attn_metadata=None,
             )
+        hidden_states = hidden_states[:num_scheduled_tokens]
         hidden_states = hidden_states[logits_indices]
         logits = self.model.compute_logits(hidden_states, None)
 
@@ -371,6 +404,18 @@ def execute_model(
         return model_runner_output
 
     def load_model(self) -> None:
+        if self.use_cuda_graph:
+            # FIXME(woosuk): Currently, the custom ops are not supported
+            # in the piecewise compilation mode. We rely on TorchInductor
+            # to optimize the model.
+            os.environ["VLLM_CUSTOM_OPS"] = "none"
+            set_compilation_config(
+                CompilationConfig(
+                    use_cudagraph=True,
+                    non_cudagraph_ops=["vllm.unified_flash_attention"],
+                    use_inductor=True,
+                ))
+
         logger.info("Starting to load model %s...", self.model_config.model)
         with DeviceMemoryProfiler() as m:  # noqa: SIM117
             with patch("vllm.model_executor.layers.sampler.Sampler", Sampler):
@@ -381,26 +426,61 @@ def load_model(self) -> None:
                     self.model_memory_usage / float(2**30))
 
     def _dummy_run(self, model: nn.Module, num_tokens: int) -> None:
-        input_ids = torch.zeros(num_tokens,
-                                dtype=torch.int32,
-                                device=self.device)
-        positions = torch.zeros(num_tokens,
-                                dtype=torch.long,
-                                device=self.device)
-        kv_caches = [None for _ in range(self.num_attn_layers)]
-        model(input_ids, positions, kv_caches, attn_metadata=None)
-        return
+        # use an empty tensor instead of `None` to force Dynamo to pass
+        # it by reference, rather than specializing on the value `None`.
+        # the `dtype` argument does not matter, and we use `float32` as
+        # a placeholder (it has wide hardware support).
+        # it is important to create tensors inside the loop, rather than
+        # multiplying the list, to prevent Dynamo from treating them as
+        # aliased tensors.
+        dummy_kv_caches = [
+            torch.tensor([], dtype=torch.float32, device=self.device)
+            for _ in range(self.num_attn_layers)
+        ]
+        with set_forward_context(None):  # noqa: SIM117
+            with set_compile_context(self.cudagraph_batch_sizes):
+                # Trigger compilation for general shape.
+                model(self.input_ids,
+                      self.positions,
+                      dummy_kv_caches,
+                      attn_metadata=None)
 
     @torch.inference_mode()
     def profile_run(self) -> None:
         self._dummy_run(self.model, self.max_num_tokens)
         torch.cuda.synchronize()
-        return
 
     @torch.inference_mode()
     def capture_model(self) -> None:
-        # TODO: Implement CUDA graph support.
-        return
+        if not self.use_cuda_graph:
+            logger.warning(
+                "Skipping CUDA graph capture. Please set "
+                "VLLM_TORCH_COMPILE_LEVEL=%d to use CUDA graphs.",
+                CompilationLevel.PIECEWISE)
+            return
+
+        start_time = time.perf_counter()
+        start_free_gpu_memory = torch.cuda.mem_get_info()[0]
+
+        with set_forward_context(None):
+            # Trigger CUDA graph capture for specific shapes.
+            # Capture the large shapes first so that the smaller shapes
+            # can reuse the memory pool allocated for the large shapes.
+            for num_tokens in reversed(self.cudagraph_batch_sizes):
+                self.model(
+                    self.input_ids[:num_tokens],
+                    self.positions[:num_tokens],
+                    kv_caches=self.kv_caches,
+                    attn_metadata=None,
+                )
+
+        end_time = time.perf_counter()
+        end_free_gpu_memory = torch.cuda.mem_get_info()[0]
+        elapsed_time = end_time - start_time
+        cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
+        # This usually takes 5~20 seconds.
+        logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
+                    elapsed_time, cuda_graph_size / (1 << 30))
 
     def initialize_kv_cache(self, num_blocks: int) -> None:
         assert len(self.kv_caches) == 0
@@ -412,6 +492,13 @@ def initialize_kv_cache(self, num_blocks: int) -> None:
                             dtype=self.kv_cache_dtype,
                             device=self.device))
 
+    def _get_padded_batch_size(self, batch_size: int) -> Optional[int]:
+        # TODO: Optimize this?
+        for size in self.cudagraph_batch_sizes:
+            if batch_size <= size:
+                return size
+        return None
+
 
 @dataclass
 class CachedRequestState:

From 4be3a45158a7fb707973d4b00410e0d2981e6825 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 5 Nov 2024 22:35:03 -0800
Subject: [PATCH 80/85] [distributed] add function to create ipc buffers
 directly (#10064)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |  1 +
 tests/distributed/test_ca_buffer_sharing.py   | 59 +++++++++++++++++++
 .../device_communicators/custom_all_reduce.py | 31 ++++++++++
 3 files changed, 91 insertions(+)
 create mode 100644 tests/distributed/test_ca_buffer_sharing.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 1eb749f64d36b..3e940549862ea 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -510,6 +510,7 @@ steps:
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
   - pytest -v -s -x lora/test_mixtral.py
 
diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py
new file mode 100644
index 0000000000000..fc4043cd3014e
--- /dev/null
+++ b/tests/distributed/test_ca_buffer_sharing.py
@@ -0,0 +1,59 @@
+# can only run on machines with p2p access across GPUs
+# can only run with torchrun:
+# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py
+
+import ctypes
+
+import torch
+import torch.distributed as dist
+
+from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+from vllm.distributed.device_communicators.custom_all_reduce import (  # noqa
+    CustomAllreduce)
+
+# create a cpu process group for communicating metadata (ipc handle)
+dist.init_process_group(backend="gloo")
+rank = local_rank = dist.get_rank()
+world_size = dist.get_world_size()
+
+# every process sets its own device (differently)
+lib = CudaRTLibrary()
+lib.cudaSetDevice(rank)
+
+buffer_size_in_bytes = 1024
+byte_value = 2  # the value we write to the buffer for verification
+
+pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes)
+
+print(f"Rank {rank} has pointers {pointers}")
+
+dist.barrier()
+torch.cuda.synchronize()
+
+if rank == 0:
+    # the first rank tries to write to all buffers
+    for p in pointers:
+        pointer = ctypes.c_void_p(p)
+        lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes)
+
+dist.barrier()
+torch.cuda.synchronize()
+
+host_data = (ctypes.c_char * buffer_size_in_bytes)()
+
+# all ranks read from all buffers, and check if the data is correct
+for p in pointers:
+    pointer = ctypes.c_void_p(p)
+    lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes)
+    for i in range(buffer_size_in_bytes):
+        assert ord(host_data[i]) == byte_value, (
+            f"Rank {rank} failed"
+            f" to verify buffer {p}. Expected {byte_value}, "
+            f"got {ord(host_data[i])}")
+
+print(f"Rank {rank} verified all buffers")
+
+dist.barrier()
+torch.cuda.synchronize()
+
+CustomAllreduce.free_shared_buffer(pointers)
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index c3632aee6d11a..3b5d92561cf25 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -1,3 +1,4 @@
+import ctypes
 from contextlib import contextmanager
 from typing import Any, List, Optional, Union
 
@@ -7,6 +8,7 @@
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
+from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
 from vllm.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
 from vllm.distributed.parallel_state import in_the_same_node_as
@@ -174,6 +176,35 @@ def __init__(self,
                                        offsets, rank, self.full_nvlink)
         self.register_buffer(self.buffer)
 
+    @staticmethod
+    def create_shared_buffer(
+            size_in_bytes: int,
+            group: Optional[ProcessGroup] = None) -> List[int]:
+        lib = CudaRTLibrary()
+        pointer = lib.cudaMalloc(size_in_bytes)
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        world_size = dist.get_world_size(group=group)
+        rank = dist.get_rank(group=group)
+        handles = [None] * world_size
+        dist.all_gather_object(handles, handle, group=group)
+
+        pointers: List[int] = []
+        for i, h in enumerate(handles):
+            if i == rank:
+                pointers.append(pointer.value)  # type: ignore
+            else:
+                pointers.append(
+                    lib.cudaIpcOpenMemHandle(h).value)  # type: ignore
+
+        return pointers
+
+    @staticmethod
+    def free_shared_buffer(pointers: List[int],
+                           group: Optional[ProcessGroup] = None) -> None:
+        rank = dist.get_rank(group=group)
+        lib = CudaRTLibrary()
+        lib.cudaFree(ctypes.c_void_p(pointers[rank]))
+
     @contextmanager
     def capture(self):
         """

From 21063c11c7d340dbb01460e22d98d3619737cd4d Mon Sep 17 00:00:00 2001
From: Aaron Pham <contact@aarnphm.xyz>
Date: Wed, 6 Nov 2024 02:11:55 -0500
Subject: [PATCH 81/85] [CI/Build] drop support for  Python 3.8 EOL (#8464)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
---
 .../convert-results-json-to-markdown.py       | 10 +--
 .../scripts/generate-nightly-markdown.py      |  4 +-
 .../scripts/summary-nightly-results.py        |  4 +-
 .github/workflows/mypy.yaml                   |  2 +-
 .github/workflows/publish.yml                 |  2 +-
 .github/workflows/ruff.yml                    | 32 ++++-----
 .github/workflows/yapf.yml                    | 26 ++++----
 .readthedocs.yaml                             | 11 ++--
 CMakeLists.txt                                | 36 +++++-----
 benchmarks/backend_request_func.py            | 22 ++-----
 benchmarks/kernels/benchmark_machete.py       |  6 +-
 csrc/quantization/machete/generate.py         |  8 +--
 docs/source/getting_started/installation.rst  | 10 +--
 pyproject.toml                                |  4 +-
 setup.py                                      |  9 ++-
 tests/compile/piecewise/test_toy_llama.py     |  4 +-
 tests/conftest.py                             | 29 +++-----
 tests/core/block/test_prefix_caching_block.py | 12 ++--
 tests/kernels/test_mamba_ssm.py               |  2 +-
 .../mm_processor_kwargs/test_qwen.py          |  2 +-
 tests/samplers/test_rejection_sampler.py      | 10 ++-
 tests/test_logger.py                          |  2 +-
 tests/tokenization/test_detokenize.py         |  4 +-
 tools/profiler/print_layerwise_table.py       |  2 +-
 tools/profiler/visualize_layerwise_profile.py |  2 +-
 tools/report_build_time_ninja.py              | 32 ++++-----
 use_existing_torch.py                         |  2 +-
 .../ops/blocksparse_attention/interface.py    |  6 +-
 vllm/config.py                                |  7 +-
 vllm/core/evictor.py                          |  2 +-
 .../custom_all_reduce_utils.py                |  2 +-
 vllm/engine/async_llm_engine.py               |  2 +-
 vllm/engine/llm_engine.py                     |  4 +-
 vllm/engine/metrics_types.py                  |  2 +-
 vllm/engine/output_processor/multi_step.py    |  2 +-
 vllm/entrypoints/chat_utils.py                |  2 +-
 vllm/entrypoints/openai/run_batch.py          |  2 +-
 vllm/executor/ray_gpu_executor.py             |  2 +-
 vllm/logger.py                                |  3 +-
 vllm/lora/models.py                           |  4 +-
 vllm/model_executor/custom_op.py              |  2 +-
 vllm/model_executor/layers/resampler.py       |  1 -
 .../model_executor/layers/rotary_embedding.py |  1 -
 vllm/model_executor/model_loader/loader.py    |  2 +-
 vllm/model_executor/model_loader/openvino.py  |  2 +-
 .../model_executor/model_loader/tensorizer.py |  5 +-
 .../model_loader/weight_utils.py              |  9 ++-
 vllm/model_executor/models/arctic.py          |  4 +-
 vllm/model_executor/models/baichuan.py        |  1 -
 vllm/model_executor/models/bloom.py           |  1 -
 vllm/model_executor/models/chatglm.py         |  1 -
 vllm/model_executor/models/commandr.py        |  1 -
 vllm/model_executor/models/dbrx.py            |  1 -
 vllm/model_executor/models/decilm.py          |  1 -
 vllm/model_executor/models/deepseek.py        |  1 -
 vllm/model_executor/models/deepseek_v2.py     |  1 -
 vllm/model_executor/models/exaone.py          |  1 -
 vllm/model_executor/models/falcon.py          |  1 -
 vllm/model_executor/models/fuyu.py            |  1 -
 vllm/model_executor/models/gemma.py           |  1 -
 vllm/model_executor/models/gemma2.py          |  1 -
 .../models/glm4_vision_encoder.py             |  1 -
 vllm/model_executor/models/gpt2.py            |  1 -
 vllm/model_executor/models/gpt_bigcode.py     |  1 -
 vllm/model_executor/models/gpt_j.py           |  1 -
 vllm/model_executor/models/gpt_neox.py        |  1 -
 vllm/model_executor/models/granite.py         |  1 -
 vllm/model_executor/models/granitemoe.py      |  1 -
 .../models/idefics2_vision_model.py           |  2 -
 vllm/model_executor/models/internlm2.py       |  1 -
 vllm/model_executor/models/internlm2_ve.py    |  1 -
 vllm/model_executor/models/jais.py            |  1 -
 vllm/model_executor/models/jamba.py           |  1 -
 vllm/model_executor/models/llama.py           |  1 -
 vllm/model_executor/models/mamba.py           |  1 -
 vllm/model_executor/models/minicpm.py         |  1 -
 vllm/model_executor/models/minicpm3.py        |  1 -
 vllm/model_executor/models/minicpmv.py        |  1 -
 vllm/model_executor/models/mixtral.py         |  1 -
 vllm/model_executor/models/mixtral_quant.py   |  1 -
 vllm/model_executor/models/mllama.py          |  1 -
 vllm/model_executor/models/mlp_speculator.py  |  2 +-
 vllm/model_executor/models/molmo.py           |  6 +-
 vllm/model_executor/models/mpt.py             |  1 -
 vllm/model_executor/models/nemotron.py        |  1 -
 vllm/model_executor/models/olmo.py            |  1 -
 vllm/model_executor/models/opt.py             |  1 -
 vllm/model_executor/models/orion.py           |  1 -
 vllm/model_executor/models/persimmon.py       |  1 -
 vllm/model_executor/models/phi.py             |  1 -
 vllm/model_executor/models/phi3.py            |  1 -
 vllm/model_executor/models/phi3v.py           |  1 -
 vllm/model_executor/models/phimoe.py          |  1 -
 vllm/model_executor/models/pixtral.py         | 10 +--
 vllm/model_executor/models/qwen.py            |  1 -
 vllm/model_executor/models/qwen2.py           |  7 +-
 vllm/model_executor/models/qwen2_audio.py     |  1 -
 vllm/model_executor/models/qwen2_cls.py       |  7 +-
 vllm/model_executor/models/qwen2_moe.py       |  1 -
 vllm/model_executor/models/qwen2_rm.py        |  7 +-
 vllm/model_executor/models/qwen2_vl.py        | 10 ++-
 vllm/model_executor/models/solar.py           |  1 -
 vllm/model_executor/models/stablelm.py        |  1 -
 vllm/model_executor/models/starcoder2.py      |  1 -
 vllm/model_executor/models/xverse.py          |  1 -
 vllm/multimodal/base.py                       | 66 +++++++++++--------
 vllm/prompt_adapter/utils.py                  | 17 +++--
 vllm/transformers_utils/config.py             |  2 +-
 vllm/transformers_utils/configs/chatglm.py    |  1 -
 vllm/transformers_utils/configs/exaone.py     |  1 -
 vllm/transformers_utils/configs/jais.py       |  1 -
 vllm/transformers_utils/configs/mpt.py        |  7 +-
 vllm/transformers_utils/configs/nemotron.py   |  7 +-
 vllm/transformers_utils/configs/solar.py      |  1 -
 vllm/utils.py                                 |  4 +-
 115 files changed, 240 insertions(+), 322 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index f90e464288cf1..7cf05610b9953 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -56,7 +56,7 @@
 
 def read_markdown(file):
     if os.path.exists(file):
-        with open(file, "r") as f:
+        with open(file) as f:
             return f.read() + "\n"
     else:
         return f"{file} not found.\n"
@@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving):
     # collect results
     for test_file in results_folder.glob("*.json"):
 
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
             raw_result = json.loads(f.read())
 
         if "serving" in str(test_file):
             # this result is generated via `benchmark_serving.py`
 
             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                 command = json.loads(f.read())
             raw_result.update(command)
 
@@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving):
             # this result is generated via `benchmark_latency.py`
 
             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                 command = json.loads(f.read())
             raw_result.update(command)
 
@@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving):
             # this result is generated via `benchmark_throughput.py`
 
             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                 command = json.loads(f.read())
             raw_result.update(command)
 
diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
index 6059588fe7277..052060c576300 100644
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -72,7 +72,7 @@ def main(args):
 
     # collect results
     for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
             results = results + json.loads(f.read())
 
     # generate markdown table
@@ -80,7 +80,7 @@ def main(args):
 
     md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
 
-    with open(args.description, "r") as f:
+    with open(args.description) as f:
         description = f.read()
 
     description = description.format(
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 4e4d4cd4ca3c6..92d6fad73a94c 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -36,11 +36,11 @@
     # collect results
     for test_file in results_folder.glob("*.json"):
 
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
             raw_result = json.loads(f.read())
 
         # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
+        with open(test_file.with_suffix(".commands")) as f:
             command = json.loads(f.read())
         raw_result.update(command)
 
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 18b354948f0cc..28d2e5fb8dbd9 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -25,7 +25,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
     - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
     - name: Set up Python ${{ matrix.python-version }}
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index f959a1cacf866..578c3fbd4e816 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -48,7 +48,7 @@ jobs:
       fail-fast: false
       matrix:
           os: ['ubuntu-20.04']
-          python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+          python-version: ['3.9', '3.10', '3.11', '3.12']
           pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
           cuda-version: ['11.8', '12.1']
 
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 197f918765e7d..edf98ce2fcab0 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -29,19 +29,19 @@ jobs:
       matrix:
         python-version: ["3.12"]
     steps:
-    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install -r requirements-lint.txt
-    - name: Analysing the code with ruff
-      run: |
-        echo "::add-matcher::.github/workflows/matchers/ruff.json"
-        ruff check --output-format github .
-    - name: Run isort
-      run: |
-        isort . --check-only
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements-lint.txt
+      - name: Analysing the code with ruff
+        run: |
+          echo "::add-matcher::.github/workflows/matchers/ruff.json"
+          ruff check --output-format github .
+      - name: Run isort
+        run: |
+          isort . --check-only
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
index 35579302c5c14..4221c139ccf79 100644
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -23,16 +23,16 @@ jobs:
       matrix:
         python-version: ["3.12"]
     steps:
-    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install yapf==0.32.0
-        pip install toml==0.10.2
-    - name: Running yapf
-      run: |
-        yapf --diff --recursive .
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install yapf==0.32.0
+          pip install toml==0.10.2
+      - name: Running yapf
+        run: |
+          yapf --diff --recursive .
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 42cbf18a0f712..34735700a224e 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -6,17 +6,16 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.8"
+    python: '3.9'
 
 sphinx:
-   configuration: docs/source/conf.py
-   fail_on_warning: true
+  configuration: docs/source/conf.py
+  fail_on_warning: true
 
 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats: []
 
 # Optionally declare the Python requirements required to build your docs
 python:
-   install:
-   - requirements: docs/requirements-docs.txt
-
+  install:
+    - requirements: docs/requirements-docs.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 943424bc4edfa..c372ba98befbf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,9 +128,9 @@ endif()
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   #
-  # For cuda we want to be able to control which architectures we compile for on 
+  # For cuda we want to be able to control which architectures we compile for on
   # a per-file basis in order to cut down on compile time. So here we extract
-  # the set of architectures we want to compile for and remove the from the 
+  # the set of architectures we want to compile for and remove the from the
   # CMAKE_CUDA_FLAGS so that they are not applied globally.
   #
   clear_cuda_arches(CUDA_ARCH_FLAGS)
@@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
   # Filter the target architectures by the supported supported archs
   # since for some files we will build for all CUDA_ARCHS.
-  cuda_archs_loose_intersection(CUDA_ARCHS 
+  cuda_archs_loose_intersection(CUDA_ARCHS
     "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
   message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
 else()
@@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # are not supported by Machete yet.
   cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
   if (MARLIN_ARCHS)
-    set(MARLIN_SRCS 
+    set(MARLIN_SRCS
        "csrc/quantization/fp8/fp8_marlin.cu"
        "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
        "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
@@ -277,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                      "in CUDA target architectures")
     endif()
 
-    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't 
+    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
     # build any 3x kernels
     set(SCALED_MM_3X_ARCHS)
   endif()
@@ -285,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   #
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
-  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS 
+  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
     "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
@@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
     #
-    # For the Machete kernels we automatically generate sources for various 
+    # For the Machete kernels we automatically generate sources for various
     # preselected input type pairs and schedules.
     # Generate sources:
-    set(MACHETE_GEN_SCRIPT 
+    set(MACHETE_GEN_SCRIPT
       ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
     file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
 
@@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
         OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
       execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env 
-        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH 
+        COMMAND ${CMAKE_COMMAND} -E env
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
           ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
         RESULT_VARIABLE machete_generation_result
         OUTPUT_VARIABLE machete_generation_output
@@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
       if (NOT machete_generation_result EQUAL 0)
         message(FATAL_ERROR "Machete generation failed."
-                            " Result: \"${machete_generation_result}\"" 
+                            " Result: \"${machete_generation_result}\""
                             "\nCheck the log for details: "
                             "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
       else()
-        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} 
+        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
             CACHE STRING "Last run machete generate script hash" FORCE)
         message(STATUS "Machete generation completed successfully.")
       endif()
@@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
     message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
         AND MACHETE_ARCHS)
       message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@@ -392,8 +392,8 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
-# If CUTLASS is compiled on NVCC >= 12.5, it by default uses 
-# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the 
+# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
+# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
 # driver API. This causes problems when linking with earlier versions of CUDA.
 # Setting this variable sidesteps the issue by calling the driver directly.
 target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
@@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
   return()
 endif ()
 
-# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target  
-# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the 
-# arches in the CUDA case (and instead set the gencodes on a per file basis) 
+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
+# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
+# arches in the CUDA case (and instead set the gencodes on a per file basis)
 # we need to manually set VLLM_GPU_ARCHES here.
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   foreach(_ARCH ${CUDA_ARCHS})
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 0a903877f000d..a42e70170ba28 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -79,7 +79,7 @@ async def async_request_tgi(
                         # any data, we should skip it.
                         if chunk_bytes.startswith(":"):
                             continue
-                        chunk = remove_prefix(chunk_bytes, "data:")
+                        chunk = chunk_bytes.removeprefix("data:")
 
                         data = json.loads(chunk)
                         timestamp = time.perf_counter()
@@ -144,8 +144,8 @@ async def async_request_trt_llm(
                         if not chunk_bytes:
                             continue
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data:")
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data:")
 
                         data = json.loads(chunk)
                         output.generated_text += data["text_output"]
@@ -261,8 +261,8 @@ async def async_request_openai_completions(
                         if not chunk_bytes:
                             continue
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data: ")
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data: ")
                         if chunk == "[DONE]":
                             latency = time.perf_counter() - st
                         else:
@@ -349,8 +349,8 @@ async def async_request_openai_chat_completions(
                         if not chunk_bytes:
                             continue
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data: ")
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data: ")
                         if chunk == "[DONE]":
                             latency = time.perf_counter() - st
                         else:
@@ -389,14 +389,6 @@ async def async_request_openai_chat_completions(
     return output
 
 
-# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix):]
-    return text
-
-
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index b70c4b94c97a1..665b50bf18cf0 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -269,10 +269,10 @@ def run_square_bench(args):
 
 
 def run_range_bench(args):
-    m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
-    m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+    m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
+    m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
     m_increment, k_increment, n_increment = \
-        [int(x) for x in args.dim_increment.split(",")]
+        (int(x) for x in args.dim_increment.split(","))
     Ms = list(range(m_start, m_end + 1, m_increment))
     Ks = list(range(k_start, k_end + 1, k_increment))
     Ns = list(range(n_start, n_end + 1, n_increment))
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index ebbe76cfb944a..d126af1849024 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -468,7 +468,7 @@ def generate():
     impl_configs = []
 
     GPTQ_kernel_type_configs = list(
-        (TypeConfig(
+        TypeConfig(
             element_a=element_a,
             element_b=element_b,
             element_b_scale=element_a,
@@ -476,7 +476,7 @@ def generate():
             element_d=element_a,
             accumulator=DataType.f32,
         ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
-         for element_a in (DataType.f16, DataType.bf16)))
+        for element_a in (DataType.f16, DataType.bf16))
 
     GPTQ_kernel_specializations = [
         Specialization(with_C=False, with_zeropoints=False, with_scales=True)
@@ -490,7 +490,7 @@ def generate():
     ]
 
     AWQ_kernel_type_configs = list(
-        (TypeConfig(
+        TypeConfig(
             element_a=element_a,
             element_b=element_b,
             element_b_scale=element_a,
@@ -498,7 +498,7 @@ def generate():
             element_d=element_a,
             accumulator=DataType.f32,
         ) for element_b in (DataType.u4, DataType.u8)
-         for element_a in (DataType.f16, DataType.bf16)))
+        for element_a in (DataType.f16, DataType.bf16))
 
     AWQ_kernel_specializations = [
         Specialization(with_C=False, with_zeropoints=True, with_scales=True)
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index a706b285edede..61871cdf41125 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -10,7 +10,7 @@ Requirements
 ============
 
 * OS: Linux
-* Python: 3.8 - 3.12
+* Python: 3.9 -- 3.12
 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 
 Install released versions
@@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
 .. tip::
 
     Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
-    For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` . 
+    For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
     As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
 
 
@@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example
     $ export MAX_JOBS=6
     $ pip install -e .
 
-This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. 
-A side effect is a much slower build process. 
+This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
+A side effect is a much slower build process.
 
 Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
 
@@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
 Unsupported OS build
 --------------------
 
-vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. 
+vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
 
 Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing:
 
diff --git a/pyproject.toml b/pyproject.toml
index 0bbab3cd3fbc3..3562569647391 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ select = [
     # Pyflakes
     "F",
     # pyupgrade
-    # "UP",
+    "UP",
     # flake8-bugbear
     "B",
     # flake8-simplify
@@ -55,7 +55,7 @@ ignore = [
 ]
 
 [tool.mypy]
-python_version = "3.8"
+python_version = "3.9"
 
 ignore_missing_imports = true
 check_untyped_defs = true
diff --git a/setup.py b/setup.py
index 8abeb0ba739db..f145a33258d70 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,4 @@
 import importlib.util
-import io
 import logging
 import os
 import re
@@ -327,7 +326,7 @@ def get_neuronxcc_version():
                                 "__init__.py")
 
     # Check if the command was executed successfully
-    with open(version_file, "rt") as fp:
+    with open(version_file) as fp:
         content = fp.read()
 
     # Extract the version using a regular expression
@@ -404,7 +403,8 @@ def read_readme() -> str:
     """Read the README file if present."""
     p = get_path("README.md")
     if os.path.isfile(p):
-        return io.open(get_path("README.md"), "r", encoding="utf-8").read()
+        with open(get_path("README.md"), encoding="utf-8") as f:
+            return f.read()
     else:
         return ""
 
@@ -498,7 +498,6 @@ def _read_requirements(filename: str) -> List[str]:
         "Documentation": "https://vllm.readthedocs.io/en/latest/",
     },
     classifiers=[
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
@@ -512,7 +511,7 @@ def _read_requirements(filename: str) -> List[str]:
     ],
     packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
                                     "tests*")),
-    python_requires=">=3.8",
+    python_requires=">=3.9",
     install_requires=get_requirements(),
     ext_modules=ext_modules,
     extras_require={
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index 9c65059c6b348..73fa9e9906936 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -429,8 +429,8 @@ def benchmark():
     # print in tabular format
     print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
     for b in cudagraph_sizes:
-        print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
-               f"\t{piecewise_cudagraph_time[b]:.3f}"))
+        print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
+              f"\t{piecewise_cudagraph_time[b]:.3f}")
 
 
 if __name__ == "__main__":
diff --git a/tests/conftest.py b/tests/conftest.py
index bdc6ffb148602..f9dfabc82639b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,5 @@
 import json
 import os
-import sys
 import tempfile
 from collections import UserList
 from enum import Enum
@@ -52,7 +51,7 @@
 
 
 def _read_prompts(filename: str) -> List[str]:
-    with open(filename, "r") as f:
+    with open(filename) as f:
         prompts = f.readlines()
         return prompts
 
@@ -62,14 +61,8 @@ class _ImageAssetPrompts(TypedDict):
     cherry_blossom: str
 
 
-if sys.version_info < (3, 9):
-    # UserList cannot be subscripted
-    class _ImageAssetsBase(UserList):
-        pass
-else:
-
-    class _ImageAssetsBase(UserList[ImageAsset]):
-        pass
+class _ImageAssetsBase(UserList[ImageAsset]):
+    pass
 
 
 class _ImageAssets(_ImageAssetsBase):
@@ -94,14 +87,8 @@ class _VideoAssetPrompts(TypedDict):
     sample_demo_1: str
 
 
-if sys.version_info < (3, 9):
-    # UserList cannot be subscripted
-    class _VideoAssetsBase(UserList):
-        pass
-else:
-
-    class _VideoAssetsBase(UserList[VideoAsset]):
-        pass
+class _VideoAssetsBase(UserList[VideoAsset]):
+    pass
 
 
 class _VideoAssets(_VideoAssetsBase):
@@ -958,7 +945,7 @@ def dummy_opt_path():
                               "*.msgpack"
                           ])
         assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
             config = json.load(f)
         config["architectures"] = ["MyOPTForCausalLM"]
         with open(json_path, "w") as f:
@@ -977,7 +964,7 @@ def dummy_llava_path():
                               "*.msgpack"
                           ])
         assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
             config = json.load(f)
         config["architectures"] = ["MyLlava"]
         with open(json_path, "w") as f:
@@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path():
                               "*.msgpack"
                           ])
         assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
             config = json.load(f)
         config["architectures"] = ["MyGemma2Embedding"]
         with open(json_path, "w") as f:
diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
index 1a6e17ef7b445..d325b9606843e 100644
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -99,13 +99,11 @@ def test_blocks_have_correct_hash_in_chain(block_size: int,
 
         token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
 
-        first_chain, second_chain = [
-            TestPrefixCachingBlock.create_chain(
-                block_size=block_size,
-                token_ids=token_ids,
-                num_empty_trailing_blocks=num_empty_trailing_blocks)
-            for _ in range(2)
-        ]
+        first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            num_empty_trailing_blocks=num_empty_trailing_blocks)
+                                     for _ in range(2))
 
         for first_chain_block, second_chain_block in zip(
                 first_chain, second_chain):
diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py
index ad05a97685351..19d1158c79c73 100644
--- a/tests/kernels/test_mamba_ssm.py
+++ b/tests/kernels/test_mamba_ssm.py
@@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
         for var in (u_ref, delta_ref, B_ref, C_ref, z_ref)
     ]
     for i in range(len(seqlens[0])):
-        u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits]
+        u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits)
         if padded_state_indices[i] == PAD_SLOT_ID:
             continue
         out_ref_s, _ = selective_scan_ref(
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
index a01651b171d60..6ae8a6a704b0a 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
@@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
 # Sad path tests for the multimodal input processor and mapper, respectively
 @pytest.mark.parametrize("mm_data", [
     {
-        "image": torch.rand((5))
+        "image": torch.rand(5)
     },
     {
         "image": torch.rand((5, 5, 5, 5, 5))
diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index a8deab3718be1..f5497976faf7a 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -413,12 +413,10 @@ def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler):
     def generate_probs_for_test(
         self, draft_and_target_probs_equal: bool
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        draft_probs, target_probs = [
-            F.softmax(
-                torch.rand(self.vocab_size, dtype=torch.float32),
-                dim=-1,
-            ) for _ in range(2)
-        ]
+        draft_probs, target_probs = (F.softmax(
+            torch.rand(self.vocab_size, dtype=torch.float32),
+            dim=-1,
+        ) for _ in range(2))
 
         num_reference_probs = 100
         reference_probs = F.softmax(
diff --git a/tests/test_logger.py b/tests/test_logger.py
index fadf66f2b61d4..a937b0812ed0c 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -29,7 +29,7 @@ def test_trace_function_call():
     cur_dir = os.path.dirname(__file__)
     enable_trace_function_call(path, cur_dir)
     f1(1)
-    with open(path, 'r') as f:
+    with open(path) as f:
         content = f.read()
 
     assert "f1" in content
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index a3e70a40db979..84348cbc0bced 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -93,10 +93,10 @@ def test_mistral_edge_case(tokenizer, truth):
 def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
     if "mistral" in tokenizer_name:
         yield (
-            bool(True) if request.param else
+            True if request.param else
             pytest.skip("mistral doesn't support skip_special_tokens=False"))
     else:
-        yield bool(True) if request.param else bool(False)
+        yield bool(request.param)
 
 
 @pytest.mark.parametrize("truth", TRUTH)
diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py
index bbd24b085e3a7..081076ad7dbdc 100644
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -46,7 +46,7 @@ def get_entries(node, curr_depth=0):
 
     args = parser.parse_args()
 
-    with open(args.json_trace, "r") as f:
+    with open(args.json_trace) as f:
         profile_data = json.load(f)
 
     if args.table == "summary":
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index 65ee3ae108ae1..efd6beee865c2 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -434,7 +434,7 @@ def make_plot_title_suffix(profile_json: dict) -> str:
                 f"{', Sparsity ' + sparsity if sparsity else ''}")
 
     profile_json = None
-    with open(json_trace, "r") as f:
+    with open(json_trace) as f:
         profile_json = json.load(f)
     assert profile_json is not None
 
diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py
index 33431a33ac837..51ad2adc74fe1 100644
--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@@ -81,7 +81,7 @@ def WeightedDuration(self):
         # Allow for modest floating-point errors
         epsilon = 0.000002
         if (self.weighted_duration > self.Duration() + epsilon):
-            print('%s > %s?' % (self.weighted_duration, self.Duration()))
+            print('{} > {}?'.format(self.weighted_duration, self.Duration()))
         assert (self.weighted_duration <= self.Duration() + epsilon)
         return self.weighted_duration
 
@@ -104,7 +104,7 @@ def ReadTargets(log, show_all):
     The result is a list of Target objects."""
     header = log.readline()
     assert header == '# ninja log v5\n', \
-           'unrecognized ninja log version %r' % header
+           'unrecognized ninja log version {!r}'.format(header)
     targets_dict = {}
     last_end_seen = 0.0
     for line in log:
@@ -254,8 +254,8 @@ def SummarizeEntries(entries, extra_step_types):
     # Warn if the sum of weighted times is off by more than half a second.
     if abs(length - weighted_total) > 500:
         print('Warning: Possible corrupt ninja log, results may be '
-              'untrustworthy. Length = %.3f, weighted total = %.3f' %
-              (length, weighted_total))
+              'untrustworthy. Length = {:.3f}, weighted total = {:.3f}'.format(
+                  length, weighted_total))
 
     entries_by_ext = defaultdict(list)
     for target in entries:
@@ -263,16 +263,17 @@ def SummarizeEntries(entries, extra_step_types):
         entries_by_ext[extension].append(target)
 
     for key, values in entries_by_ext.items():
-        print('    Longest build steps for %s:' % key)
+        print('    Longest build steps for {}:'.format(key))
         values.sort(key=lambda x: x.WeightedDuration())
         for target in values[-long_count:]:
-            print('      %8.1f weighted s to build %s (%.1f s elapsed time)' %
-                  (target.WeightedDuration(), target.DescribeTargets(),
-                   target.Duration()))
-
-    print('    %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
-          'parallelism)' %
-          (length, total_cpu_time, total_cpu_time * 1.0 / length))
+            print(
+                '      {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'.
+                format(target.WeightedDuration(), target.DescribeTargets(),
+                       target.Duration()))
+
+    print('    {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
+          'parallelism)'.format(length, total_cpu_time,
+                                total_cpu_time * 1.0 / length))
     print('    %d build steps completed, average of %1.2f/s' %
           (len(entries), len(entries) / (length)))
 
@@ -298,11 +299,12 @@ def main():
         long_ext_count += len(args.step_types.split(';'))
 
     try:
-        with open(log_file, 'r') as log:
+        with open(log_file) as log:
             entries = ReadTargets(log, False)
             SummarizeEntries(entries, args.step_types)
-    except IOError:
-        print('Log file %r not found, no build summary created.' % log_file)
+    except OSError:
+        print('Log file {!r} not found, no build summary created.'.format(
+            log_file))
         return errno.ENOENT
 
 
diff --git a/use_existing_torch.py b/use_existing_torch.py
index e11746459908b..319d262898fe3 100644
--- a/use_existing_torch.py
+++ b/use_existing_torch.py
@@ -4,7 +4,7 @@
 requires_files += ["pyproject.toml"]
 for file in requires_files:
     print(f">>> cleaning {file}")
-    with open(file, 'r') as f:
+    with open(file) as f:
         lines = f.readlines()
     if "torch" in "".join(lines).lower():
         print("removed:")
diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py
index a98eb431ac7fc..350f88c8f9740 100644
--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@@ -192,10 +192,8 @@ def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None):
         attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen]
 
         q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1)
-        k2, v2 = [
-            self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
-            for x in [k, v]
-        ]
+        k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
+                  for x in [k, v])
         spda_output = torch.nn.functional.scaled_dot_product_attention(
             q2, k2, v2, attn_mask=attn_mask, scale=sm_scale)
         return self.transpose_and_unpad(spda_output, cu_seqlens)
diff --git a/vllm/config.py b/vllm/config.py
index 814e00c8785f0..851d35dfd9fb0 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -668,9 +668,10 @@ def get_multimodal_config(self) -> "MultiModalConfig":
     @property
     def is_encoder_decoder_model(self) -> bool:
         """Extract the HF encoder/decoder model flag."""
-        return getattr(self.hf_config, "is_encoder_decoder", False) or (
-            (hasattr(self.hf_config, "text_config") and getattr(
-                self.hf_config.text_config, "is_encoder_decoder", False)))
+        return getattr(
+            self.hf_config, "is_encoder_decoder",
+            False) or (hasattr(self.hf_config, "text_config") and getattr(
+                self.hf_config.text_config, "is_encoder_decoder", False))
 
     @property
     def is_multimodal_model(self) -> bool:
diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py
index 0b943e6e65f1c..ed7e06cab2996 100644
--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@@ -52,7 +52,7 @@ def num_blocks(self) -> int:
         pass
 
 
-class BlockMetaData():
+class BlockMetaData:
     """Data structure for storing key data describe cached block, so that
     evitor could use to make its decision which one to choose for eviction
 
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 983e772a3f79b..1f78e10cc1dcd 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -240,7 +240,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
     if is_distributed:
         get_world_group().barrier()
     logger.info("reading GPU P2P access cache from %s", path)
-    with open(path, "r") as f:
+    with open(path) as f:
         cache = json.load(f)
     _gpu_p2p_access_cache = cache
     return _gpu_p2p_access_cache[f"{src}->{tgt}"]
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index b0fdc67776bbd..161b85646b6e8 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -812,7 +812,7 @@ async def _engine_abort(self, request_ids: Iterable[str]):
     async def run_engine_loop(engine_ref: ReferenceType):
         """We use a weakref to the engine so that the running loop
         doesn't prevent the engine being garbage collected."""
-        engine: Optional["AsyncLLMEngine"] = engine_ref()
+        engine: Optional[AsyncLLMEngine] = engine_ref()
         if not engine:
             return
 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index a1809b1a9dd26..404e7ed2c6ef9 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1541,8 +1541,8 @@ def _has_remaining_steps(
                 seq_group.state.remaining_steps != ref_remaining_steps
                 for seq_group in seq_group_metadata_list[1:]
         ]):
-            raise AssertionError(("All running sequence groups should "
-                                  "have the same remaining steps."))
+            raise AssertionError("All running sequence groups should "
+                                 "have the same remaining steps.")
 
         return ref_remaining_steps > 0
 
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 25b7a7479672a..19dcbfe57d112 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -77,7 +77,7 @@ def __init__(self, local_interval: float) -> None:
         self.num_generation_tokens: List[int] = []
         self.last_local_log = time.time()
         self.local_interval = local_interval
-        self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
+        self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None
 
     @abstractmethod
     def log(self, stats: Stats) -> None:
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 3ed37a269c4b4..223790806ab18 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -63,7 +63,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
             single_step_process_prompt_logprob(self, seq_group, output)
 
     @staticmethod
-    @functools.lru_cache()
+    @functools.lru_cache
     def _log_prompt_logprob_unsupported_warning_once():
         # Reminder: Please update docs/source/serving/compatibility_matrix.rst
         # If the feature combo become valid
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 2b339ab6d44e4..0ada0aaacda24 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -362,7 +362,7 @@ def load_chat_template(
     if chat_template is None:
         return None
     try:
-        with open(chat_template, "r") as f:
+        with open(chat_template) as f:
             resolved_chat_template = f.read()
     except OSError as e:
         if isinstance(chat_template, Path):
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index a64467a311523..0d016d949d22b 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -120,7 +120,7 @@ async def read_file(path_or_url: str) -> str:
                    session.get(path_or_url) as resp:
             return await resp.text()
     else:
-        with open(path_or_url, "r", encoding="utf-8") as f:
+        with open(path_or_url, encoding="utf-8") as f:
             return f.read()
 
 
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 9433dce842b09..66bab2c686c67 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -32,7 +32,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
     uses_ray: bool = True
 
     def _init_executor(self) -> None:
-        self.forward_dag: Optional["ray.dag.CompiledDAG"] = None
+        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
         # If the env var is set, it uses the Ray's compiled DAG API
         # which optimizes the control plane overhead.
         # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
diff --git a/vllm/logger.py b/vllm/logger.py
index ccf09691a052a..d6fcda02a0fb3 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -67,8 +67,7 @@ def _configure_vllm_root_logger() -> None:
             raise RuntimeError(
                 "Could not load logging config. File does not exist: %s",
                 VLLM_LOGGING_CONFIG_PATH)
-        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8",
-                  mode="r") as file:
+        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
             custom_config = json.loads(file.read())
 
         if not isinstance(custom_config, dict):
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index d0279f273db7a..81e274612b73b 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -343,7 +343,7 @@ def __init__(
             # text modules (e.g. ChatGLM)
             and hasattr(self.model, "get_mm_mapping"))
         self.packed_modules: Dict[str, List[str]] = {}
-        self.modules: Dict[str, "BaseLayerWithLoRA"] = {}
+        self.modules: Dict[str, BaseLayerWithLoRA] = {}
         # Dict instead of a Set for compatibility with LRUCache.
         self._last_mapping: Optional[LoRAMapping] = None
         self._create_lora_modules()
@@ -548,7 +548,7 @@ def create_dummy_lora(
             else:
                 parts = module_name.split(".")
                 replacements = self.packed_modules_mapping[parts[-1]]
-                subloras: List[Optional["LoRALayerWeights"]] = []
+                subloras: List[Optional[LoRALayerWeights]] = []
                 for i, r in enumerate(replacements):
                     lora = LoRALayerWeights.create_dummy_lora_weights(
                         module_name + "." + r,
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 764f4e9c99df8..bfca15c2b6a3e 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -103,7 +103,7 @@ def enabled(cls) -> bool:
     # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE
     # Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence.
     @staticmethod
-    @lru_cache()
+    @lru_cache
     def default_on() -> bool:
         count_none = envs.VLLM_CUSTOM_OPS.count("none")
         count_all = envs.VLLM_CUSTOM_OPS.count("all")
diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py
index bca44d2bf2e28..aae806f6af323 100644
--- a/vllm/model_executor/layers/resampler.py
+++ b/vllm/model_executor/layers/resampler.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 2158ad3339673..ac60e0e6d48a0 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 1f8d531198324..464915248c9ad 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -746,7 +746,7 @@ def __init__(self, load_config: LoadConfig):
 
         config_file_path = self._get_config_file(qlora_adapter)
 
-        with open(config_file_path, "r") as f:
+        with open(config_file_path) as f:
             config = json.load(f)
             self.target_modules = config["target_modules"]
 
diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py
index 573f2a04895d9..e6299295c85a2 100644
--- a/vllm/model_executor/model_loader/openvino.py
+++ b/vllm/model_executor/model_loader/openvino.py
@@ -190,7 +190,7 @@ def get_model(
     kv_cache_dtype: ov.Type,
     **kwargs,
 ) -> torch.nn.Module:
-    lora_config = kwargs.get("lora_config", None)
+    lora_config = kwargs.get("lora_config")
     ov_core = kwargs.get("ov_core")
     if lora_config:
         raise ValueError(
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 36f33d6d139ee..437d2772e1f28 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -280,7 +280,7 @@ def __init__(self, tensorizer_config: TensorizerConfig,
         self.tensorizer_args = (
             self.tensorizer_config._construct_tensorizer_args())
         self.extra_kwargs = extra_kwargs
-        if extra_kwargs.get("quant_config", None) is not None:
+        if extra_kwargs.get("quant_config") is not None:
             self.quant_config = extra_kwargs["quant_config"]
         else:
             self.quant_config = quant_config
@@ -380,8 +380,7 @@ def tensorizer_weights_iterator(
     stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
     with TensorDeserializer(stream, **deserializer_args,
                             device="cpu") as state:
-        for name, param in state.items():
-            yield name, param
+        yield from state.items()
     del state
 
 
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 0c51314bc90df..9488d54edf365 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -188,7 +188,7 @@ def get_quant_config(model_config: ModelConfig,
             f"{quant_config_files}")
 
     quant_config_file = quant_config_files[0]
-    with open(quant_config_file, "r") as f:
+    with open(quant_config_file) as f:
         config = json.load(f)
 
         if model_config.quantization == "bitsandbytes":
@@ -306,7 +306,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str],
 
     # Iterate through the weight_map (weight_name: safetensors files)
     # to identify weights that we should use.
-    with open(index_file_name, "r") as f:
+    with open(index_file_name) as f:
         weight_map = json.load(f)["weight_map"]
     weight_files_in_index = set()
     for weight_name in weight_map:
@@ -382,7 +382,7 @@ def np_cache_weights_iterator(
             with open(weight_names_file, "w") as f:
                 json.dump(weight_names, f)
 
-    with open(weight_names_file, "r") as f:
+    with open(weight_names_file) as f:
         weight_names = json.load(f)
 
     for name in weight_names:
@@ -423,8 +423,7 @@ def pt_weights_iterator(
             bar_format=_BAR_FORMAT,
     ):
         state = torch.load(bin_file, map_location="cpu")
-        for name, param in state.items():
-            yield name, param
+        yield from state.items()
         del state
         torch.cuda.empty_cache()
 
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index fd29d4ccc59d8..5b712ba83c25a 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -48,7 +48,7 @@ def __init__(self,
                  is_residual_mlp: bool = False,
                  quant_config: Optional[QuantizationConfig] = None,
                  reduce_results: bool = True):
-        super(ArcticMLP, self).__init__()
+        super().__init__()
         self.hidden_size = config.hidden_size
         self.expert_id = expert_id
         self.layer_id = layer_id
@@ -89,7 +89,7 @@ def __init__(self,
                  params_dtype: Optional[torch.dtype] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  reduce_results: bool = True):
-        super(ArcticMoE, self).__init__()
+        super().__init__()
 
         self.tp_size = tp_size or get_tensor_model_parallel_world_size()
         self.hidden_size = config.hidden_size
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index f2cfdf8ffd30a..1fbf4135add7a 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 77ab7de6165fb..83ff39a30fbe3 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 181f3c2b0fc35..881b86564e811 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/THUDM/GLM-4
 """Inference-only ChatGLM model compatible with THUDM weights."""
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 348e6d20f3297..835682ca3b379 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index aae7ab7370b74..3e60eee2d8fe2 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 from typing import Iterable, List, Optional, Tuple, Union
 
 import torch
diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py
index 7ed2b96e65c49..8c9653463858b 100644
--- a/vllm/model_executor/models/decilm.py
+++ b/vllm/model_executor/models/decilm.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 DeciAI Research Team. All rights reserved.
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index 5b4db8f258711..d278ea5b6a991 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index d4ad0c6b5c99e..834be78bce87b 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 22f194c776b69..23efe0359cb4a 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py
 # Copyright 2024 The LG U+ CTO AI Tech Lab.
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index c376347811965..ad07fc3b3776e 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 0de590d1d8372..3db82a898159b 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py
 # Copyright 2023 The vLLM team.
 # Copyright 2023 HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 029178af61da0..fc3f5cb20afb0 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2023 The vLLM team.
 # Copyright (c) Google Inc.
 #
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 9238ed839c9de..c365880109ef8 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py
index 3213a8b29a104..025615b0920fd 100644
--- a/vllm/model_executor/models/glm4_vision_encoder.py
+++ b/vllm/model_executor/models/glm4_vision_encoder.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/THUDM/GLM-4
 """Inference-only GLM-4v model visual encoder compatible with THUDM weights."""
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 3330d84021368..a06200c4b7e08 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 24c79a8855475..7612ea641d95c 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 9a42b359ae44f..b28a6081b868f 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 1bccef7a5f173..931052c7cccf0 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index c968817747754..bee48f377e0f5 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 5307bb21adb96..691a6e77c46c4 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index 43f4f29814e6d..53869b8fa6bd8 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -1,5 +1,3 @@
-# coding=utf-8
-
 # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py
 # Copyright 2024 The vLLM team.
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 313d98b649b48..afefb6cd9fa96 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py
index edd867e4b6457..108fc8382049d 100644
--- a/vllm/model_executor/models/internlm2_ve.py
+++ b/vllm/model_executor/models/internlm2_ve.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 from typing import List, Optional, Tuple, Union
 
 import torch
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index b947f24a693b5..301893f74cb87 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 6f7949c880e61..81d88a47c1941 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 """Inference-only Jamba model."""
 from typing import Iterable, List, Optional, Tuple
 
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 38a31f420cec9..6c0a8b5ef8451 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index 985ba6f3c60c1..aac4b7aa2661d 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 """PyTorch MAMBA model."""
 from typing import Iterable, List, Optional, Tuple
 
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 03fb036020f2f..acf03cd8cb8ad 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index 3b5fd95328d74..eeedf55cf3e57 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2024 The ModelBest team.
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index f90df6b7df036..5acd3f65896c7 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 1514243ad59c9..e9b9c4d838faa 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index 63e2c60a84271..9647d69be8a0a 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 251bfc079684e..5fa8d19b97fe8 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index 42ccd01298169..ae218d749fc0b 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -37,7 +37,7 @@ def __init__(
         eps=1e-06,
         elementwise_scale_and_shift=True,
     ):
-        super(MLPSpeculatorLayerNorm, self).__init__()
+        super().__init__()
         self.elementwise_scale_and_shift = elementwise_scale_and_shift
         if self.elementwise_scale_and_shift:
             self.weight = nn.Parameter(torch.empty(normalized_shape))
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 522aa748f78b6..785b53670542f 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1121,9 +1121,9 @@ def _merge_multimodal_embeddings(
             batch_size * num_image * num_patch, -1).contiguous()
 
         image_input_idx = image_input_idx * valid.to(image_input_idx.dtype)
-        offset = torch.cat(
-            [seq_len.new_zeros(
-                (1)), seq_len.cumsum(dim=0)[:-1]], dim=0)[:, None]
+        offset = torch.cat([seq_len.new_zeros(1),
+                            seq_len.cumsum(dim=0)[:-1]],
+                           dim=0)[:, None]
         image_input_idx = image_input_idx + offset.to(image_input_idx.dtype)
         image_input_idx = image_input_idx.flatten()[:, None]
         mat = image_input_idx == torch.arange(
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index ee802030a5ef3..fdd8af79b5470 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
 import math
 from typing import Iterable, List, Optional, Tuple, Union
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 72a09129fed63..b649064536dc2 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 90ab8abcb84b4..dd3f58289a227 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
 # Copyright 2024 The vLLM team.
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 7521ab749e10f..7a76e4a0906db 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index 055407587c598..a338a93c2dd9a 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py
 # Copyright (c) OrionStar Inc.
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index fc9ef15db26c0..bd4a9f698bacd 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py
 # Copyright 2023 The vLLM team.
 # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 4e7935a7636c5..492122450b237 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py
index 02b2ff01c3832..34141511ea791 100644
--- a/vllm/model_executor/models/phi3.py
+++ b/vllm/model_executor/models/phi3.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from llama.py
 """Inference-only Phi3 model code inherit from Llama.py"""
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 5b477a8ed5f49..1c41891ced416 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index bb8a9327b4ac8..59843ae3dfd59 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index ee9f150b17cfc..6e9092432467a 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -136,11 +136,11 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
 
         if image_token_id not in inputs['prompt_token_ids']:
             raise ValueError(
-                (f"You've passed {inputs=} without {image_token_id=}"
-                 " Make sure to process your input via mistral_common's"
-                 " tokenizer or pass a chat completion request. For more"
-                 " For more info, see: "
-                 "https://github.com/vllm-project/vllm/issues/8411."))
+                f"You've passed {inputs=} without {image_token_id=}"
+                " Make sure to process your input via mistral_common's"
+                " tokenizer or pass a chat completion request. For more"
+                " For more info, see: "
+                "https://github.com/vllm-project/vllm/issues/8411.")
 
     return inputs
 
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index b2b5c70182135..3a0e33e8a3eff 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
 # Copyright (c) Alibaba Cloud.
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 72b286fe6f6d6..49b3de1304cca 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
 # Copyright 2024 The Qwen team.
@@ -417,9 +416,9 @@ def __init__(
                 and hasattr(config, "max_window_layers")):
             raise ValueError("Sliding window for some but all layers is not "
                              "supported. This model uses sliding window "
-                             "but `max_window_layers` = %s is less than "
-                             "`num_hidden_layers` = %s. Please open an issue "
-                             "to discuss this feature." % (
+                             "but `max_window_layers` = {} is less than "
+                             "`num_hidden_layers` = {}. Please open an issue "
+                             "to discuss this feature.".format(
                                  config.max_window_layers,
                                  config.num_hidden_layers,
                              ))
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 6114548bda42c..556c09400ee83 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py
index 2d6f3e90f761c..b9e3b74c477e2 100644
--- a/vllm/model_executor/models/qwen2_cls.py
+++ b/vllm/model_executor/models/qwen2_cls.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
 # Copyright 2024 Kakao Corp. (Kanana-X Team)
@@ -60,9 +59,9 @@ def __init__(
                 and hasattr(config, "max_window_layers")):
             raise ValueError("Sliding window for some but all layers is not "
                              "supported. This model uses sliding window "
-                             "but `max_window_layers` = %s is less than "
-                             "`num_hidden_layers` = %s. Please open an issue "
-                             "to discuss this feature." % (
+                             "but `max_window_layers` = {} is less than "
+                             "`num_hidden_layers` = {}. Please open an issue "
+                             "to discuss this feature.".format(
                                  config.max_window_layers,
                                  config.num_hidden_layers,
                              ))
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index dac85e35d369d..98bb48a274e49 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
 # Copyright 2024 The Qwen team.
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index 901b1daaa14a4..0fbf305da8b94 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
 # Copyright 2024 The Qwen team.
@@ -71,9 +70,9 @@ def __init__(
                 and hasattr(config, "max_window_layers")):
             raise ValueError("Sliding window for some but all layers is not "
                              "supported. This model uses sliding window "
-                             "but `max_window_layers` = %s is less than "
-                             "`num_hidden_layers` = %s. Please open an issue "
-                             "to discuss this feature." % (
+                             "but `max_window_layers` = {} is less than "
+                             "`num_hidden_layers` = {}. Please open an issue "
+                             "to discuss this feature.".format(
                                  config.max_window_layers,
                                  config.num_hidden_layers,
                              ))
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index d801903f8f9fe..e30b84e8dd44c 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
 # Copyright 2024 The Qwen team.
@@ -246,9 +245,8 @@ def forward(
         q, k, v = dist_utils.split_tensor_along_last_dim(x, 3)
         batch_size = q.shape[1]
 
-        q, k, v = [
-            rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
-        ]
+        q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous()
+                   for x in (q, k, v))
         if rotary_pos_emb is not None:
             q = apply_rotary_pos_emb_vision(q, rotary_pos_emb)
             k = apply_rotary_pos_emb_vision(k, rotary_pos_emb)
@@ -258,7 +256,7 @@ def forward(
             #   flash_attn_varlen_func)
             from flash_attn import flash_attn_varlen_func
 
-            q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
+            q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
 
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
             output = flash_attn_varlen_func(q,
@@ -276,7 +274,7 @@ def forward(
                                       b=batch_size)
         elif self.attn_backend == _Backend.TORCH_SDPA:
             seq_length = q.size(1)
-            q, k, v = [rearrange(x, "b s h d -> b h s d") for x in [q, k, v]]
+            q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v])
             attention_mask = torch.zeros([1, seq_length, seq_length],
                                          device=q.device,
                                          dtype=torch.bool)
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index e3e7ccb5cf179..1b233ac7427dd 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 083a48588d01a..34389b645a7c1 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team.
 # All rights reserved.
 #
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 8f0644bca3e2e..b24c5dadb2b2b 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py
index 036789642d3c4..e559988ada753 100644
--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/xverse/XVERSE-7B/blob/main/modeling_xverse.py
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 6b10d0c609f13..5ff6f93fb25b4 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,4 +1,3 @@
-import sys
 from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
 from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping,
@@ -34,14 +33,9 @@
 :meth:`MultiModalInputs.batch`.
 """
 
-if sys.version_info < (3, 9):
-    # UserDict cannot be subscripted
-    class _MultiModalInputsBase(UserDict):
-        pass
-else:
 
-    class _MultiModalInputsBase(UserDict[str, NestedTensors]):
-        pass
+class _MultiModalInputsBase(UserDict[str, NestedTensors]):
+    pass
 
 
 class MultiModalInputs(_MultiModalInputsBase):
@@ -262,18 +256,23 @@ def wrapper(model_cls: N) -> N:
                 logger.warning(
                     "Model class %s already has an input mapper "
                     "registered to %s. It is overwritten by the new one.",
-                    model_cls, self)
+                    model_cls,
+                    self,
+                )
 
-            self._input_mappers[model_cls] = mapper \
-                or self._default_input_mapper
+            self._input_mappers[model_cls] = (mapper
+                                              or self._default_input_mapper)
 
             return model_cls
 
         return wrapper
 
-    def map_input(self, model_config: "ModelConfig",
-                  data: MultiModalData[object],
-                  mm_processor_kwargs: Dict[str, Any]) -> MultiModalInputs:
+    def map_input(
+        self,
+        model_config: "ModelConfig",
+        data: MultiModalData[object],
+        mm_processor_kwargs: Dict[str, Any],
+    ) -> MultiModalInputs:
         """
         Transform the data into a dictionary of model inputs using the
         input mapper registered for that model.
@@ -348,13 +347,15 @@ def wrapper(model_cls: N) -> N:
                 logger.warning(
                     "Model class %s already calculates maximum number of "
                     "tokens in %s. It is overwritten by the new one.",
-                    model_cls, self)
+                    model_cls,
+                    self,
+                )
 
             if isinstance(max_mm_tokens, int):
                 self._validate_max_multimodal_tokens(max_mm_tokens)
 
-            self._max_mm_tokens[model_cls] = max_mm_tokens \
-                or self._default_max_multimodal_tokens
+            self._max_mm_tokens[model_cls] = (
+                max_mm_tokens or self._default_max_multimodal_tokens)
 
             return model_cls
 
@@ -482,8 +483,10 @@ def from_seq_group(
         placeholder_maps: Dict[str, MultiModalPlaceholderMap] = defaultdict(
             MultiModalPlaceholderMap)
 
-        for modality, placeholders in seq_group.multi_modal_placeholders.items(
-        ):
+        for (
+                modality,
+                placeholders,
+        ) in seq_group.multi_modal_placeholders.items():
             mm_items = mm_data.pop(modality)
             if not isinstance(mm_items, list):
                 mm_items = [mm_items]
@@ -499,8 +502,11 @@ def from_seq_group(
         return mm_data, placeholder_maps
 
     def append_items_from_seq_group(
-            self, positions: range, multi_modal_items: List[_T],
-            multi_modal_placeholders: List[PlaceholderRange]) -> List[_T]:
+        self,
+        positions: range,
+        multi_modal_items: List[_T],
+        multi_modal_placeholders: List[PlaceholderRange],
+    ) -> List[_T]:
         """
         Adds the multi-modal items that intersect ```positions`` to this
         placeholder map and returns the intersecting items.
@@ -515,20 +521,26 @@ def append_items_from_seq_group(
                                              multi_modal_items):
             placeholder = range(
                 placeholder_dict["offset"],
-                placeholder_dict["offset"] + placeholder_dict["length"])
-            intersection = range(max(positions.start, placeholder.start),
-                                 min(positions.stop, placeholder.stop))
+                placeholder_dict["offset"] + placeholder_dict["length"],
+            )
+            intersection = range(
+                max(positions.start, placeholder.start),
+                min(positions.stop, placeholder.stop),
+            )
 
             if not intersection:
                 # Skip this multi-modal item.
                 continue
 
-            token_embedding_range = range(intersection.start - positions.start,
-                                          intersection.stop - positions.start)
+            token_embedding_range = range(
+                intersection.start - positions.start,
+                intersection.stop - positions.start,
+            )
 
             multimodal_embedding_range = range(
                 intersection.start - placeholder.start + self.src_len,
-                intersection.stop - placeholder.start + self.src_len)
+                intersection.stop - placeholder.start + self.src_len,
+            )
 
             intersecting_items.append(mm_item)
             self.dest_ranges.append(token_embedding_range)
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
index 4cde2a0254b90..473b87c89c21d 100644
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -37,9 +37,8 @@ def load_peft_weights(model_id: str,
             Additional arguments to pass to the `hf_hub_download` method when 
             loading from the HuggingFace Hub.
     """
-    path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"])
-            if hf_hub_download_kwargs.get("subfolder", None) is not None else
-            model_id)
+    path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) if
+            hf_hub_download_kwargs.get("subfolder") is not None else model_id)
 
     if device is None:
         device = infer_device()
@@ -51,19 +50,19 @@ def load_peft_weights(model_id: str,
         filename = os.path.join(path, WEIGHTS_NAME)
         use_safetensors = False
     else:
-        token = hf_hub_download_kwargs.get("token", None)
+        token = hf_hub_download_kwargs.get("token")
         if token is None:
-            token = hf_hub_download_kwargs.get("use_auth_token", None)
+            token = hf_hub_download_kwargs.get("use_auth_token")
 
         hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"],
                                      SAFETENSORS_WEIGHTS_NAME)
-                        if hf_hub_download_kwargs.get("subfolder", None)
-                        is not None else SAFETENSORS_WEIGHTS_NAME)
+                        if hf_hub_download_kwargs.get("subfolder") is not None
+                        else SAFETENSORS_WEIGHTS_NAME)
         has_remote_safetensors_file = file_exists(
             repo_id=model_id,
             filename=hub_filename,
-            revision=hf_hub_download_kwargs.get("revision", None),
-            repo_type=hf_hub_download_kwargs.get("repo_type", None),
+            revision=hf_hub_download_kwargs.get("revision"),
+            repo_type=hf_hub_download_kwargs.get("repo_type"),
             token=token,
         )
         use_safetensors = has_remote_safetensors_file
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 08697274854e0..1a5870aa4f84c 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -308,7 +308,7 @@ def load_params_config(model, revision) -> PretrainedConfig:
         config_path = Path(
             hf_hub_download(model, config_file_name, revision=revision))
 
-    with open(config_path, "r") as file:
+    with open(config_path) as file:
         config_dict = json.load(file)
 
     config_mapping = {
diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py
index 49d2b8d8e21b1..e563bf6268d72 100644
--- a/vllm/transformers_utils/configs/chatglm.py
+++ b/vllm/transformers_utils/configs/chatglm.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
 from transformers import PretrainedConfig
diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py
index 805b8ad930039..f60a59f554133 100644
--- a/vllm/transformers_utils/configs/exaone.py
+++ b/vllm/transformers_utils/configs/exaone.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copied from
 # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
 # Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py
index b06a946f34a47..82f129eb2018e 100644
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 # Copyright 2023 Cerebras Systems.
diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py
index 497db0ae48c96..0f047c8b0361c 100644
--- a/vllm/transformers_utils/configs/mpt.py
+++ b/vllm/transformers_utils/configs/mpt.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copied from
 # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
 """A HuggingFace-style model configuration."""
@@ -117,10 +116,10 @@ def _validate_config(self) -> None:
                                                      init_config_defaults)
         if self.d_model % self.n_heads != 0:
             raise ValueError('d_model must be divisible by n_heads')
-        if any((
+        if any(
                 prob < 0 or prob > 1 for prob in
-            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]
-        )):
+            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
+             ]):
             raise ValueError(
                 "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
                 "probabilities and must be between 0 and 1")
diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py
index 139e6b3cdacbe..93fec667d1cf3 100644
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 HuggingFace Inc. team. All rights reserved.
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 #
@@ -144,7 +143,7 @@ def __init__(
         self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
-        head_dim = head_dim or kwargs.get("kv_channels", None)
+        head_dim = head_dim or kwargs.get("kv_channels")
         self.head_dim = head_dim if head_dim is not None else (
             hidden_size // num_attention_heads)
 
@@ -160,8 +159,8 @@ def __init__(
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
         # for backward compatibility
-        partial_rotary_factor = kwargs.get("rope_percent", None) or kwargs.get(
-            "rope_percentage", None) or partial_rotary_factor
+        partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
+            "rope_percentage") or partial_rotary_factor
         self.partial_rotary_factor = partial_rotary_factor
         self._rope_scaling_validation()
         self.attention_bias = attention_bias
diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py
index d5113bf01695a..0c1c048f670ee 100644
--- a/vllm/transformers_utils/configs/solar.py
+++ b/vllm/transformers_utils/configs/solar.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
diff --git a/vllm/utils.py b/vllm/utils.py
index 0b75e8761c916..6edc8d72f6bcf 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1153,7 +1153,7 @@ class SortedHelpFormatter(argparse.HelpFormatter):
 
     def add_arguments(self, actions):
         actions = sorted(actions, key=lambda x: x.option_strings)
-        super(SortedHelpFormatter, self).add_arguments(actions)
+        super().add_arguments(actions)
 
 
 class FlexibleArgumentParser(argparse.ArgumentParser):
@@ -1279,7 +1279,7 @@ def _load_config_file(self, file_path: str) -> List[str]:
 
         config: Dict[str, Union[int, str]] = {}
         try:
-            with open(file_path, 'r') as config_file:
+            with open(file_path) as config_file:
                 config = yaml.safe_load(config_file)
         except Exception as ex:
             logger.error(

From a5fda50a10641e47c0c290907f30ef2add6d4e7a Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Wed, 6 Nov 2024 16:50:37 +0800
Subject: [PATCH 82/85] [CI/Build] Fix large_gpu_mark reason (#10070)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 tests/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/utils.py b/tests/utils.py
index 16e21f68c7c96..00c7dabe16a7b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -699,7 +699,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
 
     return pytest.mark.skipif(
         memory_gb < min_gb,
-        reason=f"Need at least {memory_gb}GB GPU memory to run the test.",
+        reason=f"Need at least {min_gb}GB GPU memory to run the test.",
     )
 
 

From a02a50e6e5bb74f9d48f75942e47197d22ec6444 Mon Sep 17 00:00:00 2001
From: Konrad Zawora <kzawora@habana.ai>
Date: Wed, 6 Nov 2024 10:09:10 +0100
Subject: [PATCH 83/85] [Hardware][Intel-Gaudi] Add Intel Gaudi (HPU) inference
 backend (#6143)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Signed-off-by: Bob Zhu <bob.zhu@intel.com>
Signed-off-by: zehao-intel <zehao.huang@intel.com>
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Sanju C Sudhakaran <scsudhakaran@habana.ai>
Co-authored-by: Michal Adamczyk <madamczyk@habana.ai>
Co-authored-by: Marceli Fylcek <mfylcek@habana.ai>
Co-authored-by: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com>
Co-authored-by: Vivek Goel <vgoel@habana.ai>
Co-authored-by: yuwenzho <yuwen.zhou@intel.com>
Co-authored-by: Dominika Olszewska <dolszewska@habana.ai>
Co-authored-by: barak goldberg <149692267+bgoldberg-habana@users.noreply.github.com>
Co-authored-by: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com>
Co-authored-by: Jan Kaniecki <jkaniecki@habana.ai>
Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyniewicz-habana@users.noreply.github.com>
Co-authored-by: Krzysztof Wisniewski <kwisniewski@habana.ai>
Co-authored-by: Dudi Lester <160421192+dudilester@users.noreply.github.com>
Co-authored-by: Ilia Taraban <tarabanil@gmail.com>
Co-authored-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Michał Kuligowski <mkuligowski@habana.ai>
Co-authored-by: Jakub Maksymczuk <jmaksymczuk@habana.ai>
Co-authored-by: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com>
Co-authored-by: Sun Choi <schoi@habana.ai>
Co-authored-by: Iryna Boiko <iboiko@habana.ai>
Co-authored-by: Bob Zhu <41610754+czhu15@users.noreply.github.com>
Co-authored-by: hlin99 <73271530+hlin99@users.noreply.github.com>
Co-authored-by: Zehao Huang <zehao.huang@intel.com>
Co-authored-by: Andrzej Kotłowski <Andrzej.Kotlowski@intel.com>
Co-authored-by: Yan Tomsinsky <73292515+Yantom1@users.noreply.github.com>
Co-authored-by: Nir David <ndavid@habana.ai>
Co-authored-by: Yu-Zhou <yu.zhou@intel.com>
Co-authored-by: Ruheena Suhani Shaik <rsshaik@habana.ai>
Co-authored-by: Karol Damaszke <kdamaszke@habana.ai>
Co-authored-by: Marcin Swiniarski <mswiniarski@habana.ai>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Jacek Czaja <jacek.czaja@intel.com>
Co-authored-by: Jacek Czaja <jczaja@habana.ai>
Co-authored-by: Yuan <yuan.zhou@outlook.com>
---
 Dockerfile.hpu                                |   16 +
 .../getting_started/gaudi-installation.rst    |  402 ++++
 docs/source/index.rst                         |    3 +-
 requirements-hpu.txt                          |   11 +
 setup.py                                      |   47 +-
 vllm/_custom_ops.py                           |    2 +-
 vllm/attention/backends/hpu_attn.py           |  264 +++
 vllm/attention/ops/hpu_paged_attn.py          |  103 +
 vllm/attention/selector.py                    |    8 +
 vllm/config.py                                |   22 +-
 vllm/core/block/cpu_gpu_block_allocator.py    |    7 +-
 .../device_communicators/hpu_communicator.py  |   48 +
 vllm/distributed/parallel_state.py            |   19 +
 vllm/engine/arg_utils.py                      |   11 +-
 vllm/engine/async_llm_engine.py               |    8 +
 vllm/engine/llm_engine.py                     |    8 +
 vllm/executor/hpu_executor.py                 |  205 ++
 vllm/executor/ray_hpu_executor.py             |  554 +++++
 vllm/executor/ray_utils.py                    |    6 +-
 vllm/model_executor/custom_op.py              |    5 +-
 vllm/model_executor/layers/layernorm.py       |   19 +
 .../model_executor/layers/logits_processor.py |   10 +-
 .../model_executor/layers/rotary_embedding.py |   55 +
 .../layers/vocab_parallel_embedding.py        |   17 +-
 vllm/model_executor/sampling_metadata.py      |    3 +-
 vllm/platforms/__init__.py                    |   10 +
 vllm/platforms/hpu.py                         |   11 +
 vllm/platforms/interface.py                   |    4 +
 vllm/utils.py                                 |    3 +
 vllm/worker/hpu_model_runner.py               | 2008 +++++++++++++++++
 vllm/worker/hpu_worker.py                     |  410 ++++
 31 files changed, 4279 insertions(+), 20 deletions(-)
 create mode 100644 Dockerfile.hpu
 create mode 100644 docs/source/getting_started/gaudi-installation.rst
 create mode 100644 requirements-hpu.txt
 create mode 100644 vllm/attention/backends/hpu_attn.py
 create mode 100644 vllm/attention/ops/hpu_paged_attn.py
 create mode 100644 vllm/distributed/device_communicators/hpu_communicator.py
 create mode 100644 vllm/executor/hpu_executor.py
 create mode 100644 vllm/executor/ray_hpu_executor.py
 create mode 100644 vllm/platforms/hpu.py
 create mode 100644 vllm/worker/hpu_model_runner.py
 create mode 100644 vllm/worker/hpu_worker.py

diff --git a/Dockerfile.hpu b/Dockerfile.hpu
new file mode 100644
index 0000000000000..f481c8c6a57bf
--- /dev/null
+++ b/Dockerfile.hpu
@@ -0,0 +1,16 @@
+FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-hpu.txt
+
+ENV no_proxy=localhost,127.0.0.1
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+
+WORKDIR /workspace/
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst
new file mode 100644
index 0000000000000..68c1a56660fa4
--- /dev/null
+++ b/docs/source/getting_started/gaudi-installation.rst
@@ -0,0 +1,402 @@
+Installation with Intel® Gaudi® AI Accelerators
+===============================================
+
+This README provides instructions on running vLLM with Intel Gaudi devices.
+
+Requirements and Installation
+=============================
+
+Please follow the instructions provided in the `Gaudi Installation
+Guide <https://docs.habana.ai/en/latest/Installation_Guide/index.html>`__
+to set up the execution environment. To achieve the best performance,
+please follow the methods outlined in the `Optimizing Training Platform
+Guide <https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html>`__.
+
+Requirements
+------------
+
+-  OS: Ubuntu 22.04 LTS
+-  Python: 3.10
+-  Intel Gaudi accelerator
+-  Intel Gaudi software version 1.18.0
+
+
+Quick start using Dockerfile
+----------------------------
+.. code:: console
+
+   $ docker build -f Dockerfile.hpu -t vllm-hpu-env  .
+   $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
+
+
+.. tip::
+   If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation <https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html>`__. Make sure you have ``habanalabs-container-runtime`` package installed and that ``habana`` container runtime is registered.
+
+
+Build from source
+-----------------
+
+Environment verification
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To verify that the Intel Gaudi software was correctly installed, run:
+
+.. code:: console
+
+   $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
+   $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
+   $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
+   $ pip list | grep neural # verify that neural_compressor is installed
+
+Refer to `Intel Gaudi Software Stack
+Verification <https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade>`__
+for more details.
+
+Run Docker Image
+~~~~~~~~~~~~~~~~
+
+It is highly recommended to use the latest Docker image from Intel Gaudi
+vault. Refer to the `Intel Gaudi
+documentation <https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers>`__
+for more details.
+
+Use the following commands to run a Docker image:
+
+.. code:: console
+
+   $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+   $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+
+Build and Install vLLM
+~~~~~~~~~~~~~~~~~~~~~~
+
+To build and install vLLM from source, run:
+
+.. code:: console
+
+   $ git clone https://github.com/vllm-project/vllm.git
+   $ cd vllm
+   $ python setup.py develop
+
+
+Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork <https://github.com/HabanaAI/vllm-fork>`__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork <https://github.com/HabanaAI/vllm-fork>`__, run the following:
+
+.. code:: console
+
+   $ git clone https://github.com/HabanaAI/vllm-fork.git
+   $ cd vllm-fork
+   $ git checkout habana_main
+   $ python setup.py develop
+
+
+Supported Features
+==================
+
+-  `Offline batched
+   inference <https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference>`__
+-  Online inference via `OpenAI-Compatible
+   Server <https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server>`__
+-  HPU autodetection - no need to manually select device within vLLM
+-  Paged KV cache with algorithms enabled for Intel Gaudi accelerators
+-  Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
+   prefill attention, Root Mean Square Layer Normalization, Rotary
+   Positional Encoding
+-  Tensor parallelism support for multi-card inference
+-  Inference with `HPU Graphs <https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html>`__
+   for accelerating low-batch latency and throughput
+-  Attention with Linear Biases (ALiBi)
+
+Unsupported Features
+====================
+
+-  Beam search
+-  LoRA adapters
+-  Quantization
+-  Prefill chunking (mixed-batch inferencing)
+
+Supported Configurations
+========================
+
+The following configurations have been validated to function with
+Gaudi2 devices. Configurations that are not listed may or may not work.
+
+-  `meta-llama/Llama-2-7b <https://huggingface.co/meta-llama/Llama-2-7b>`__
+   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+   datatype with random or greedy sampling
+-  `meta-llama/Llama-2-7b-chat-hf <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`__
+   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+   datatype with random or greedy sampling
+-  `meta-llama/Meta-Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__
+   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+   datatype with random or greedy sampling
+-  `meta-llama/Meta-Llama-3-8B-Instruct <https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct>`__
+   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+   datatype with random or greedy sampling
+-  `meta-llama/Meta-Llama-3.1-8B <https://huggingface.co/meta-llama/Meta-Llama-3.1-8B>`__
+   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+   datatype with random or greedy sampling
+-  `meta-llama/Meta-Llama-3.1-8B-Instruct <https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct>`__
+   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+   datatype with random or greedy sampling
+-  `meta-llama/Llama-2-70b <https://huggingface.co/meta-llama/Llama-2-70b>`__
+   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+-  `meta-llama/Llama-2-70b-chat-hf <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`__
+   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+-  `meta-llama/Meta-Llama-3-70B <https://huggingface.co/meta-llama/Meta-Llama-3-70B>`__
+   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+-  `meta-llama/Meta-Llama-3-70B-Instruct <https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct>`__
+   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+-  `meta-llama/Meta-Llama-3.1-70B <https://huggingface.co/meta-llama/Meta-Llama-3.1-70B>`__
+   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+-  `meta-llama/Meta-Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct>`__
+   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+
+Performance Tuning
+==================
+
+Execution modes
+---------------
+
+Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag.  
+
+.. list-table:: vLLM execution modes
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - ``PT_HPU_LAZY_MODE``
+     - ``enforce_eager`` 
+     - execution mode
+   * - 0
+     - 0
+     - torch.compile
+   * - 0
+     - 1
+     - PyTorch eager mode
+   * - 1
+     - 0
+     - HPU Graphs
+   * - 1
+     - 1
+     - PyTorch lazy mode
+
+.. warning::
+   In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
+
+
+Bucketing mechanism
+-------------------
+
+Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler <https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime>`__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
+In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. 
+
+.. note::
+   Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
+
+Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
+
+.. code-block::
+
+      INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
+      INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
+      INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
+      INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+
+``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, the interval between ``min`` and ``step`` has special handling - ``min`` is multiplied by consecutive powers of two until ``step`` is reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
+
+Example (with ramp-up)
+
+.. code-block:: 
+   
+    min = 2, step = 32, max = 64
+    => ramp_up = (2, 4, 8, 16)
+    => stable = (32, 64)
+    => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
+
+Example (without ramp-up)
+
+.. code-block:: 
+   
+    min = 128, step = 128, max = 512
+    => ramp_up = ()
+    => stable = (128, 256, 384, 512)
+    => buckets = ramp_up + stable => (128, 256, 384, 512)
+
+
+In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. 
+
+.. warning::
+   If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
+
+As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded and executed as a ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After the prefill stage, it will be executed as a ``(4, 512)`` decode bucket and will continue as that bucket until either the batch dimension changes (due to a request being finished) - in which case it will become a ``(2, 512)`` bucket - or the context length increases above 512 tokens, in which case it will become a ``(4, 640)`` bucket. 
+
+.. note::
+   Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
+
+Warmup
+------
+
+Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
+
+.. code-block::
+
+   INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
+   INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
+   INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
+   ...
+   INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
+   INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
+   INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
+   INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
+   ...
+   INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
+   INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
+
+This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. 
+
+.. tip::
+   Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations when executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
+
+HPU Graph capture
+-----------------
+
+`HPU Graphs <https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html>`__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management.
+
+
+When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by ``gpu_memory_utilization`` flag (``0.9`` by default). 
+Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. 
+Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, it will mark 90% of free device memory at that point as usable.
+Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. 
+Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. 
+With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. 
+Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), 30% of usable graph memory is reserved for prefill graphs, and 70% for decode graphs.
+Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. 
+
+.. note:: 
+   ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.   
+
+User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented:
+-    ``max_bs`` - graph capture queue will be sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1,256)``), default strategy for decode
+-    ``min_tokens`` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``), default strategy for prompt
+
+When there is a large number of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in ``min_tokens`` strategy.
+
+
+.. note::
+   ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up the entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior of that mechanism can be observed in the example below.
+
+
+Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
+
+.. code-block::
+
+   INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
+   INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
+   INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
+   INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+   INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
+   INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
+   INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
+   INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
+   INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
+   INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
+   INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
+   INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
+   ...
+   INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
+   INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
+   INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
+   ...
+   INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
+   INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
+   ...
+   INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
+   INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
+   INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
+   INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
+   INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
+   INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
+   INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+   INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
+   INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
+
+
+Recommended vLLM Parameters
+---------------------------
+
+-  We recommend running inference on Gaudi 2 with ``block_size`` of 128
+   for BF16 data type. Using default values (16, 32) might lead to
+   sub-optimal performance due to Matrix Multiplication Engine
+   under-utilization (see `Gaudi
+   Architecture <https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html>`__).
+-  For max throughput on Llama 7B, we recommend running with batch size
+   of 128 or 256 and max context length of 2048 with HPU Graphs enabled.
+   If you encounter out-of-memory issues, see troubleshooting section.
+
+Environment variables
+---------------------
+
+**Diagnostic and profiling knobs:**
+
+-   ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai <https://perfetto.habana.ai/#!/viewer>`__. Disabled by default.
+-   ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default.
+-   ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default.
+-   ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default.
+-   ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default.
+
+**Performance tuning knobs:**
+
+-   ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default
+-   ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default
+-   ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default
+-   ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default
+-   ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default
+-   ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism
+
+    - ``{phase}`` is either ``PROMPT`` or ``DECODE``
+    - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK``
+    - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX``
+    - Default values:
+
+      - Prompt:
+         - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1``
+         - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)``
+         - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)``
+         - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size``
+         - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size``
+         - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len``
+
+      - Decode:
+         - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1``
+         - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)``
+         - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs``
+         - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size``
+         - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size``
+         - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)``
+
+
+Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:  
+
+-   ``PT_HPU_LAZY_MODE``: if ``0``, PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default 
+-   ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs
+
+Troubleshooting: Tweaking HPU Graphs
+====================================
+
+If you experience device out-of-memory issues or want to attempt
+inference at higher batch sizes, try tweaking HPU Graphs by following
+the below:
+
+-  Tweak ``gpu_memory_utilization`` knob. It will decrease the
+   allocation of KV cache, leaving some headroom for capturing graphs
+   with larger batch size. By default ``gpu_memory_utilization`` is set
+   to 0.9. It attempts to allocate ~90% of HBM left for KV cache after
+   short profiling run. Note that decreasing it reduces the number of KV
+   cache blocks you have available, and therefore reduces the effective
+   maximum number of tokens you can handle at a given time.
+
+-  If this method is not efficient, you can disable ``HPUGraph``
+   completely. With HPU Graphs disabled, you are trading latency and
+   throughput at lower batches for potentially higher throughput on
+   higher batches. You can do that by adding ``--enforce-eager`` flag to
+   server (for online inference), or by passing ``enforce_eager=True``
+   argument to LLM constructor (for offline inference).
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 2399fcf5faec9..51add1fd4d0ab 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -43,7 +43,7 @@ vLLM is flexible and easy to use with:
 * Tensor parallelism and pipeline parallelism support for distributed inference
 * Streaming outputs
 * OpenAI-compatible API server
-* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
+* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
 * Prefix caching support
 * Multi-lora support
 
@@ -66,6 +66,7 @@ Documentation
    getting_started/amd-installation
    getting_started/openvino-installation
    getting_started/cpu-installation
+   getting_started/gaudi-installation
    getting_started/neuron-installation
    getting_started/tpu-installation
    getting_started/xpu-installation
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
new file mode 100644
index 0000000000000..4674efb812cfd
--- /dev/null
+++ b/requirements-hpu.txt
@@ -0,0 +1,11 @@
+# Common dependencies
+-r requirements-common.txt
+
+# Dependencies for HPU code
+ray
+triton
+pandas
+tabulate
+setuptools>=61
+setuptools-scm>=8
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6
diff --git a/setup.py b/setup.py
index f145a33258d70..51ca5e2abecf7 100644
--- a/setup.py
+++ b/setup.py
@@ -253,6 +253,24 @@ def run(self):
             self.copy_file(file, dst_file)
 
 
+def _is_hpu() -> bool:
+    is_hpu_available = True
+    try:
+        subprocess.run(["hl-smi"], capture_output=True, check=True)
+    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
+        if not os.path.exists('/dev/accel/accel0') and not os.path.exists(
+                '/dev/accel/accel_controlD0'):
+            # last resort...
+            try:
+                output = subprocess.check_output(
+                    'lsmod | grep habanalabs | wc -l', shell=True)
+                is_hpu_available = int(output) > 0
+            except (ValueError, FileNotFoundError, PermissionError,
+                    subprocess.CalledProcessError):
+                is_hpu_available = False
+    return is_hpu_available or VLLM_TARGET_DEVICE == "hpu"
+
+
 def _no_device() -> bool:
     return VLLM_TARGET_DEVICE == "empty"
 
@@ -260,7 +278,7 @@ def _no_device() -> bool:
 def _is_cuda() -> bool:
     has_cuda = torch.version.cuda is not None
     return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
-            and not (_is_neuron() or _is_tpu()))
+            and not (_is_neuron() or _is_tpu() or _is_hpu()))
 
 
 def _is_hip() -> bool:
@@ -356,6 +374,23 @@ def get_path(*filepath) -> str:
     return os.path.join(ROOT_DIR, *filepath)
 
 
+def get_gaudi_sw_version():
+    """
+    Returns the driver version reported by ``hl-smi`` ("0.0.0" on failure).
+    """
+    # Enable console printing for `hl-smi`; keep the rest of the parent env
+    output = subprocess.run("hl-smi",
+                            shell=True,
+                            text=True,
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE,
+                            env={**os.environ, "ENABLE_CONSOLE": "true"})
+    if output.returncode == 0 and output.stdout:
+        return output.stdout.split("\n")[2].replace(
+            " ", "").split(":")[1][:-1].split("-")[0]
+    return "0.0.0"  # when hl-smi is not available
+
+
 def get_vllm_version() -> str:
     version = get_version(
         write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
@@ -385,6 +420,12 @@ def get_vllm_version() -> str:
         if neuron_version != MAIN_CUDA_VERSION:
             neuron_version_str = neuron_version.replace(".", "")[:3]
             version += f"{sep}neuron{neuron_version_str}"
+    elif _is_hpu():
+        # Get the Intel Gaudi Software Suite version
+        gaudi_sw_version = str(get_gaudi_sw_version())
+        if gaudi_sw_version != MAIN_CUDA_VERSION:
+            gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
+            version += f"{sep}gaudi{gaudi_sw_version}"
     elif _is_openvino():
         version += f"{sep}openvino"
     elif _is_tpu():
@@ -443,6 +484,8 @@ def _read_requirements(filename: str) -> List[str]:
         requirements = _read_requirements("requirements-rocm.txt")
     elif _is_neuron():
         requirements = _read_requirements("requirements-neuron.txt")
+    elif _is_hpu():
+        requirements = _read_requirements("requirements-hpu.txt")
     elif _is_openvino():
         requirements = _read_requirements("requirements-openvino.txt")
     elif _is_tpu():
@@ -453,7 +496,7 @@ def _read_requirements(filename: str) -> List[str]:
         requirements = _read_requirements("requirements-xpu.txt")
     else:
         raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, "
+            "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
             "OpenVINO, or CPU.")
     return requirements
 
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 46a2fb8bc80a2..682e08db99fa9 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -12,7 +12,7 @@
 
 logger = init_logger(__name__)
 
-if not current_platform.is_tpu():
+if not current_platform.is_tpu() and not current_platform.is_hpu():
     try:
         import vllm._C
     except ImportError as e:
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
new file mode 100644
index 0000000000000..a8f4b09b67274
--- /dev/null
+++ b/vllm/attention/backends/hpu_attn.py
@@ -0,0 +1,264 @@
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+import torch
+import vllm_hpu_extension.ops as ops
+from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
+
+from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionMetadata, AttentionType)
+from vllm.attention.backends.utils import CommonAttentionState
+from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention,
+                                               HPUPagedAttentionMetadata)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class HPUAttentionBackend(AttentionBackend):
+
+    @staticmethod
+    def get_impl_cls() -> Type["HPUAttentionImpl"]:
+        return HPUAttentionImpl
+
+    @staticmethod
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return HPUAttentionMetadata
+
+    @staticmethod
+    def get_state_cls() -> Type["CommonAttentionState"]:
+        return CommonAttentionState
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size,
+                                                    num_kv_heads, head_size)
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: Dict[int, int],
+    ) -> None:
+        HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: Dict[int, List[int]],
+    ) -> None:
+        HPUPagedAttention.copy_blocks(kv_caches, src_to_dists)
+
+
+@dataclass
+class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata):
+    """Metadata for HPUAttentionBackend."""
+    # Currently, input sequences can only contain all prompts
+    # or all decoding. True if all sequences are prompts.
+    is_prompt: bool
+    attn_bias: Optional[torch.Tensor]
+    seq_lens_tensor: Optional[torch.Tensor]
+
+
+class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
+    """
+    If the input tensors contain prompt tokens, the layout is as follows:
+    |<--------------- num_prefill_tokens ----------------->|
+    |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->|
+
+    Otherwise, the layout is as follows:
+    |<----------------- num_decode_tokens ------------------>|
+    |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->|
+
+    Generation tokens can contain padding when cuda-graph is used.
+    Currently, prompt tokens don't contain any padding.
+
+    The prompts might have different lengths, while the generation tokens
+    always have length 1.
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+        max_seq_len: int = 4096,
+    ) -> None:
+        super(AttentionImpl, self).__init__()
+        self.kv_cache_dtype = kv_cache_dtype
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.matmul_qk = Matmul()
+        self.softmax = Softmax()
+        self.matmul_av = Matmul()
+        self.k_cache = VLLMKVCache()
+        self.v_cache = VLLMKVCache()
+        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+        self.sliding_window = sliding_window
+        self.alibi_slopes = alibi_slopes
+        if alibi_slopes is not None:
+            alibi_slopes_tensor = torch.tensor(alibi_slopes,
+                                               dtype=torch.bfloat16)
+            self.alibi_slopes = alibi_slopes_tensor
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
+        self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
+                                              '0').lower() in ['1', 'true']
+        if self.prefill_usefusedsdpa:
+            assert alibi_slopes is None, \
+                'Prefill with FusedSDPA not supported with alibi slopes!'
+
+        suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
+        if head_size not in suppored_head_sizes:
+            raise ValueError(
+                f"Head size {head_size} is not supported by PagedAttention. "
+                f"Supported head sizes are: {suppored_head_sizes}.")
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: HPUAttentionMetadata,
+        k_scale: float = 1.0,
+        v_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
+    ) -> torch.Tensor:
+        """Forward pass with HPU prompt attention and PagedAttention.
+
+        Args:
+            query: shape = [batch_size, seq_len, num_heads * head_size]
+            key: shape = [batch_size, seq_len_kv, num_kv_heads * head_size]
+            value: shape = [batch_size, seq_len_kv, num_kv_heads * head_size]
+            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            attn_metadata: Metadata for attention.
+        Returns:
+            shape = [batch_size, seq_len, num_heads * head_size]
+        """
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "HPUAttentionImpl")
+        batch_size, seq_len, hidden_size = query.shape
+        _, seq_len_kv, _ = key.shape
+
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_kv_heads, self.head_size)
+        value = value.view(-1, self.num_kv_heads, self.head_size)
+        block_indices = attn_metadata.block_indices
+        block_offsets = attn_metadata.block_offsets
+        if attn_metadata.is_prompt:
+            key = key.unflatten(0, (block_indices.size(0), -1))
+            value = value.unflatten(0, (block_indices.size(0), -1))
+        if kv_cache is not None:
+            key_cache, value_cache = HPUPagedAttention.split_kv_cache(
+                kv_cache, self.num_kv_heads, self.head_size)
+
+            # Reshape the input keys and values and store them in the cache.
+            # If kv_cache is not provided, the new key and value tensors are
+            # not cached. This happens during the initial memory profiling run.
+            key_cache = self.k_cache(key, key_cache, block_indices,
+                                     block_offsets)
+            value_cache = self.v_cache(value, value_cache, block_indices,
+                                       block_offsets)
+
+        if attn_metadata.is_prompt:
+            # Prompt run.
+            if not self.prefill_usefusedsdpa:
+                # TODO: move this outside of model
+                assert attn_metadata.attn_bias is not None, \
+                        'attn_bias must be set before calling model.forward!'
+                attn_bias = attn_metadata.attn_bias
+                if self.alibi_slopes is not None:
+                    position_bias = _make_alibi_bias(self.alibi_slopes,
+                                                     self.num_kv_heads,
+                                                     attn_bias.dtype,
+                                                     attn_bias.shape[-1])
+                    attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1))
+                    attn_bias.add_(position_bias)
+            else:
+                attn_bias = None
+
+            query_shape = (batch_size, seq_len, self.num_heads, self.head_size)
+            kv_shape = (batch_size, seq_len_kv, self.num_kv_heads,
+                        self.head_size)
+            out = ops.prompt_attention(
+                query.view(query_shape),
+                key.view(kv_shape),
+                value.view(kv_shape),
+                attn_bias=attn_bias,
+                p=0.0,
+                scale=self.scale,
+                matmul_qk_op=self.matmul_qk,
+                softmax_op=self.softmax,
+                matmul_av_op=self.matmul_av,
+            )
+            output = out.reshape(batch_size, seq_len, hidden_size)
+        else:
+            # Decoding run.
+            output = HPUPagedAttention.forward_decode(
+                query=query,
+                key_cache=key_cache,
+                value_cache=value_cache,
+                block_list=attn_metadata.block_list,
+                block_mapping=attn_metadata.block_mapping,
+                block_bias=attn_metadata.attn_bias,
+                block_scales=attn_metadata.block_scales,
+                scale=self.scale,
+                matmul_qk_op=self.matmul_qk,
+                matmul_av_op=self.matmul_av,
+                keys_fetch_func=self.k_cache.fetch_from_cache,
+                values_fetch_func=self.v_cache.fetch_from_cache)
+        # Reshape the output tensor.
+        return output.view(batch_size, seq_len, hidden_size)
+
+
+def _make_alibi_bias(
+    alibi_slopes: torch.Tensor,
+    num_kv_heads: int,
+    dtype: torch.dtype,
+    seq_len: int,
+) -> torch.Tensor:
+    bias = torch.arange(seq_len, dtype=dtype)
+    # NOTE(zhuohan): HF uses
+    #     `bias = bias[None, :].repeat(seq_len, 1)`
+    # here. We find that both biases give the same results, but
+    # the bias below more accurately follows the original ALiBi
+    # paper.
+    # Calculate a matrix where each element represents ith element- jth
+    # element.
+    bias = bias[None, :] - bias[:, None]
+
+    padded_len = (seq_len + 7) // 8 * 8
+    num_heads = alibi_slopes.shape[0]
+    bias = torch.empty(
+        1,  # batch size
+        num_heads,
+        seq_len,
+        padded_len,
+        device=alibi_slopes.device,
+        dtype=dtype,
+    )[:, :, :, :seq_len].copy_(bias)
+    bias.mul_(alibi_slopes[:, None, None])
+    if num_heads != num_kv_heads:
+        bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads))
+    return bias
diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py
new file mode 100644
index 0000000000000..4c0fb2a628361
--- /dev/null
+++ b/vllm/attention/ops/hpu_paged_attn.py
@@ -0,0 +1,103 @@
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from vllm_hpu_extension import cache_ops, ops
+
+# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
+_PARTITION_SIZE = 512
+
+
+@dataclass
+class HPUPagedAttentionMetadata:
+    """Metadata for PagedAttention."""
+    block_list: Optional[torch.Tensor]
+    block_mapping: Optional[torch.Tensor]
+    block_usage: Optional[torch.Tensor]
+    block_indices: Optional[torch.Tensor]
+    block_offsets: Optional[torch.Tensor]
+    block_scales: Optional[torch.Tensor]
+
+
+class HPUPagedAttention:
+
+    @staticmethod
+    def get_supported_head_sizes() -> List[int]:
+        return [64, 80, 96, 112, 128, 256]
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        return (num_blocks, block_size, num_kv_heads, head_size)
+
+    @staticmethod
+    def split_kv_cache(
+        kv_cache: torch.Tensor,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        key_cache = kv_cache[0]
+        value_cache = kv_cache[1]
+        return key_cache, value_cache
+
+    @staticmethod
+    def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor,
+                             key_cache: torch.Tensor,
+                             value_cache: torch.Tensor,
+                             slot_mapping: torch.Tensor, kv_cache_dtype: str,
+                             is_prompt: bool) -> None:
+        cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
+                                    slot_mapping, kv_cache_dtype, is_prompt)
+
+    @staticmethod
+    def forward_decode(**kwargs) -> torch.Tensor:
+        return ops.flat_pa(**kwargs)
+
+    @staticmethod
+    def forward_prefix(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_tables: torch.Tensor,
+        subquery_start_loc: torch.Tensor,
+        seq_lens_tensor: torch.Tensor,
+        context_lens: torch.Tensor,
+        max_query_len: int,
+        alibi_slopes: Optional[torch.Tensor],
+        sliding_window: Optional[int],
+    ) -> torch.Tensor:
+        raise NotImplementedError(
+            "forward_prefix is not implemented for HPUPagedAttention")
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: Dict[int, int],
+    ) -> None:
+        src_key_cache = src_kv_cache[0]
+        dst_key_cache = dst_kv_cache[0]
+        cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
+
+        src_value_cache = src_kv_cache[1]
+        dst_value_cache = dst_kv_cache[1]
+        cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst)
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: Dict[int, List[int]],
+    ) -> None:
+        key_caches = [kv_cache[0] for kv_cache in kv_caches]
+        value_caches = [kv_cache[1] for kv_cache in kv_caches]
+        cache_ops.copy_blocks(key_caches, value_caches, src_to_dists)
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 8a59cf41a689e..991602da2853a 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -23,6 +23,7 @@ class _Backend(enum.Enum):
     TORCH_SDPA = enum.auto()
     OPENVINO = enum.auto()
     FLASHINFER = enum.auto()
+    HPU_ATTN = enum.auto()
     PALLAS = enum.auto()
     IPEX = enum.auto()
     NO_ATTENTION = enum.auto()
@@ -145,6 +146,10 @@ def get_attn_backend(
         logger.info("Using Flashinfer backend.")
         from vllm.attention.backends.flashinfer import FlashInferBackend
         return FlashInferBackend
+    elif backend == _Backend.HPU_ATTN:
+        logger.info("Using HPUAttention backend.")
+        from vllm.attention.backends.hpu_attn import HPUAttentionBackend
+        return HPUAttentionBackend
     elif backend == _Backend.PALLAS:
         logger.info("Using Pallas backend.")
         from vllm.attention.backends.pallas import PallasAttentionBackend
@@ -220,6 +225,9 @@ def which_attn_to_use(
             logger.info("%s is not supported in AMD GPUs.", selected_backend)
         return _Backend.ROCM_FLASH
 
+    if current_platform.is_hpu():
+        return _Backend.HPU_ATTN
+
     if envs.VLLM_USE_V1:
         return _Backend.FLASH_ATTN_VLLM_V1
 
diff --git a/vllm/config.py b/vllm/config.py
index 851d35dfd9fb0..91bbbfec4b7b3 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -466,9 +466,10 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
 
         # Reminder: Please update docs/source/serving/compatibility_matrix.rst
         # If the feature combo become valid
-        if device_config.device_type not in ("cuda", "tpu", "xpu"):
+        if device_config.device_type not in ("cuda", "tpu", "xpu", "hpu"):
             logger.warning(
-                "Async output processing is only supported for CUDA, TPU, XPU. "
+                "Async output processing is only supported for CUDA, TPU, XPU "
+                "and HPU. "
                 "Disabling it for other platforms.")
             self.use_async_output_proc = False
             return
@@ -860,7 +861,6 @@ class LoadConfig:
         ignore_patterns: The list of patterns to ignore when loading the model.
             Default to "original/**/*" to avoid repeated loading of llama's
             checkpoints.
-
     """
 
     load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
@@ -964,6 +964,13 @@ def __init__(
                 raise ValueError(
                     "TPU backend only supports Ray for distributed inference.")
 
+        if current_platform.is_hpu() and self.world_size > 1:
+            if self.distributed_executor_backend is None:
+                self.distributed_executor_backend = "ray"
+            if self.distributed_executor_backend != "ray":
+                raise ValueError(
+                    "HPU backend only supports Ray for distributed inference.")
+
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
@@ -1166,6 +1173,8 @@ def __init__(self, device: str = "auto") -> None:
                 self.device_type = "cuda"
             elif current_platform.is_neuron():
                 self.device_type = "neuron"
+            elif current_platform.is_hpu():
+                self.device_type = "hpu"
             elif current_platform.is_openvino():
                 self.device_type = "openvino"
             elif current_platform.is_tpu():
@@ -1745,6 +1754,13 @@ def _get_and_verify_dtype(
                     torch_dtype = torch.float16
             else:
                 torch_dtype = config_dtype
+
+            if current_platform.is_hpu() and config_dtype == torch.float16:
+                logger.info(
+                    "For HPU, we cast models to bfloat16 instead of "
+                    "using float16 by default. Please specify `dtype` if you "
+                    "want to use float16.")
+                torch_dtype = torch.bfloat16
         else:
             if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
                 raise ValueError(f"Unknown dtype: {dtype}")
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index 6eda5f99aa1c8..9727f6e19b84e 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -4,6 +4,7 @@
                                         DeviceAwareBlockAllocator)
 from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
 from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
+from vllm.platforms import current_platform
 from vllm.utils import Device
 
 
@@ -52,7 +53,11 @@ def create(
             - The block IDs are assigned contiguously, with GPU block IDs coming
                 before CPU block IDs.
         """
-        block_ids = list(range(num_gpu_blocks + num_cpu_blocks))
+        # For HPU, block id 0 is used only for padding
+        reserved_blocks = 1 if current_platform.is_hpu() else 0
+        block_ids = list(
+            range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
+        num_gpu_blocks -= reserved_blocks
         gpu_block_ids = block_ids[:num_gpu_blocks]
         cpu_block_ids = block_ids[num_gpu_blocks:]
 
diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py
new file mode 100644
index 0000000000000..cc9b19ce022b5
--- /dev/null
+++ b/vllm/distributed/device_communicators/hpu_communicator.py
@@ -0,0 +1,48 @@
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.platforms import current_platform
+
+if current_platform.is_hpu():
+    import habana_frameworks.torch as htorch  # noqa: F401
+
+
+class HpuCommunicator:
+
+    def __init__(self, group: ProcessGroup):
+        if not current_platform.is_hpu():
+            self.disabled = True
+            return
+        self.disabled = False
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+
+    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
+        # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
+        # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
+        # (which is required for tensor parallel HPUGraph inference)
+        htorch.core.mark_step()
+        dist.all_reduce(x, group=self.group)
+        return x
+
+    def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += x.dim()
+        input_size = x.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty((world_size, ) + input_size,
+                                    dtype=x.dtype,
+                                    device=x.device)
+        # All-gather.
+        htorch.core.mark_step()
+        dist.all_gather_into_tensor(output_tensor, x, group=self.group)
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(input_size[:dim] +
+                                              (world_size *
+                                               input_size[dim], ) +
+                                              input_size[dim + 1:])
+        return output_tensor
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 94ba41a016f6d..efa3525910a5e 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -177,6 +177,7 @@ def __init__(
         use_pynccl: bool,
         use_custom_allreduce: bool,
         use_tpu_communicator: bool,
+        use_hpu_communicator: bool,
         use_message_queue_broadcaster: bool = False,
         group_name: Optional[str] = None,
     ):
@@ -213,6 +214,7 @@ def __init__(
         self.use_pynccl = use_pynccl
         self.use_custom_allreduce = use_custom_allreduce
         self.use_tpu_communicator = use_tpu_communicator
+        self.use_hpu_communicator = use_hpu_communicator
 
         # lazy import to avoid documentation build error
         from vllm.distributed.device_communicators.custom_all_reduce import (
@@ -241,6 +243,12 @@ def __init__(
         if use_tpu_communicator and self.world_size > 1:
             self.tpu_communicator = TpuCommunicator(group=self.cpu_group)
 
+        from vllm.distributed.device_communicators.hpu_communicator import (
+            HpuCommunicator)
+        self.hpu_communicator: Optional[HpuCommunicator] = None
+        if use_hpu_communicator and self.world_size > 1:
+            self.hpu_communicator = HpuCommunicator(group=self.device_group)
+
         from vllm.distributed.device_communicators.shm_broadcast import (
             MessageQueue)
         self.mq_broadcaster: Optional[MessageQueue] = None
@@ -362,6 +370,10 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
             # TPU handles Dynamo with its own logic.
             return self.tpu_communicator.all_reduce(input_)
 
+        if self.hpu_communicator is not None and \
+            not self.hpu_communicator.disabled:
+            return self.hpu_communicator.all_reduce(input_)
+
         if self.ca_comm is not None and \
             not self.ca_comm.disabled and \
                 self.ca_comm.should_custom_ar(input_):
@@ -400,6 +412,11 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
         if tpu_comm is not None and not tpu_comm.disabled:
             return tpu_comm.all_gather(input_, dim)
 
+        # For HPUs, use HPU communicator.
+        hpu_comm = self.hpu_communicator
+        if hpu_comm is not None and not hpu_comm.disabled:
+            return hpu_comm.all_gather(input_, dim)
+
         if dim < 0:
             # Convert negative dim to positive.
             dim += input_.dim()
@@ -879,6 +896,7 @@ def init_world_group(ranks: List[int], local_rank: int,
         use_pynccl=False,
         use_custom_allreduce=False,
         use_tpu_communicator=False,
+        use_hpu_communicator=False,
         group_name="world",
     )
 
@@ -900,6 +918,7 @@ def init_model_parallel_group(
         use_pynccl=True,
         use_custom_allreduce=use_custom_allreduce,
         use_tpu_communicator=True,
+        use_hpu_communicator=True,
         use_message_queue_broadcaster=use_message_queue_broadcaster,
         group_name=group_name,
     )
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bd39e72d58caa..b556c0eed3776 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -17,6 +17,7 @@
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.platforms import current_platform
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.utils import check_gguf_file
@@ -37,6 +38,7 @@
     "openvino",
     "tpu",
     "xpu",
+    "hpu",
 ]
 
 
@@ -110,7 +112,9 @@ class EngineArgs:
     pipeline_parallel_size: int = 1
     tensor_parallel_size: int = 1
     max_parallel_loading_workers: Optional[int] = None
-    block_size: int = 16
+    # NOTE(kzawora): default block size for Gaudi should be 128
+    # smaller sizes still work, but very inefficiently
+    block_size: int = 16 if not current_platform.is_hpu() else 128
     enable_prefix_caching: bool = False
     disable_sliding_window: bool = False
     use_v2_block_manager: bool = True
@@ -397,7 +401,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument('--block-size',
                             type=int,
                             default=EngineArgs.block_size,
-                            choices=[8, 16, 32],
+                            choices=[8, 16, 32, 64, 128],
                             help='Token block size for contiguous chunks of '
                             'tokens. This is ignored on neuron devices and '
                             'set to max-model-len')
@@ -1132,8 +1136,7 @@ def create_engine_config(self) -> VllmConfig:
             multi_step_stream_outputs=self.multi_step_stream_outputs,
             send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                              and parallel_config.use_ray),
-            policy=self.scheduling_policy,
-        )
+            policy=self.scheduling_policy)
         lora_config = LoRAConfig(
             max_lora_rank=self.max_lora_rank,
             max_loras=self.max_loras,
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 161b85646b6e8..1a371b52bb64b 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -627,6 +627,14 @@ def _get_executor_cls(
         elif engine_config.device_config.device_type == "cpu":
             from vllm.executor.cpu_executor import CPUExecutorAsync
             executor_class = CPUExecutorAsync
+        elif engine_config.device_config.device_type == "hpu":
+            if distributed_executor_backend == "ray":
+                initialize_ray_cluster(engine_config.parallel_config)
+                from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync
+                executor_class = RayHPUExecutorAsync
+            else:
+                from vllm.executor.hpu_executor import HPUExecutorAsync
+                executor_class = HPUExecutorAsync
         elif engine_config.device_config.device_type == "openvino":
             assert distributed_executor_backend is None, (
                 "Distributed execution is not supported with "
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 404e7ed2c6ef9..5d321fc98aeb6 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -528,6 +528,14 @@ def _get_executor_cls(cls,
         elif engine_config.device_config.device_type == "cpu":
             from vllm.executor.cpu_executor import CPUExecutor
             executor_class = CPUExecutor
+        elif engine_config.device_config.device_type == "hpu":
+            if distributed_executor_backend == "ray":
+                initialize_ray_cluster(engine_config.parallel_config)
+                from vllm.executor.ray_hpu_executor import RayHPUExecutor
+                executor_class = RayHPUExecutor
+            else:
+                from vllm.executor.hpu_executor import HPUExecutor
+                executor_class = HPUExecutor
         elif engine_config.device_config.device_type == "openvino":
             from vllm.executor.openvino_executor import OpenVINOExecutor
             executor_class = OpenVINOExecutor
diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py
new file mode 100644
index 0000000000000..220e9eee87bb3
--- /dev/null
+++ b/vllm/executor/hpu_executor.py
@@ -0,0 +1,205 @@
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+import contextlib
+import os
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.sequence import ExecuteModelRequest
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
+from vllm.worker.worker_base import WorkerWrapperBase
+
+logger = init_logger(__name__)
+
+
+class HPUExecutor(ExecutorBase):
+
+    uses_ray: bool = False
+
+    def _init_executor(self) -> None:
+        """Initialize the worker and load the model."""
+        self._init_worker()
+
+    def _get_worker_kwargs(
+            self,
+            local_rank: int = 0,
+            rank: int = 0,
+            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
+        """Return worker init args for a given rank."""
+        if distributed_init_method is None:
+            distributed_init_method = get_distributed_init_method(
+                get_ip(), get_open_port())
+        return dict(
+            vllm_config=self.vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=rank == 0,
+        )
+
+    def _create_worker(self,
+                       local_rank: int = 0,
+                       rank: int = 0,
+                       distributed_init_method: Optional[str] = None):
+        wrapper = WorkerWrapperBase(
+            worker_module_name="vllm.worker.hpu_worker",
+            worker_class_name="HPUWorker",
+        )
+        wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank,
+                                                      distributed_init_method))
+        return wrapper.worker
+
+    def _init_worker(self):
+        assert self.parallel_config.world_size == 1, (
+            "HPUExecutor only supports single HPU.")
+        # Create the single driver worker, set up its device, load the model.
+        self.driver_worker = self._create_worker()
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks by invoking the
+        underlying worker.
+        """
+        return self.driver_worker.determine_num_available_blocks()
+
+    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
+        """Initialize the KV cache by invoking the underlying worker.
+        """
+        # NOTE: This is logged in the executor because there can be >1 worker
+        # with other executors. We could log in the engine level, but work
+        # remains to abstract away the device for non-GPU configurations.
+        logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
+                    num_cpu_blocks)
+        from vllm_hpu_extension.profiler import HabanaMemoryProfiler
+        with HabanaMemoryProfiler() as cache_init_m:
+            self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+        msg = f"init_cache_engine took {cache_init_m.get_summary_string()}"
+        logger.info(msg)
+
+    def finish_measurements(self):
+        self.driver_worker.finish_measurements()
+
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION     - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
+        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
+        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS         - will log cpu fallbacks per engine step, only when there was any # noqa:E501
+        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL     - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501
+        log_graph_compilation_all = os.environ.get(
+            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0'
+        log_graph_compilation = os.environ.get(
+            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION',
+            '0') != '0' or log_graph_compilation_all
+        log_cpu_fallbacks_all = os.environ.get(
+            'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
+        log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
+                                           '0') != '0' or log_cpu_fallbacks_all
+        if log_graph_compilation or log_cpu_fallbacks:
+            from habana_frameworks.torch.hpu.metrics import metric_localcontext
+            seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+            is_prompt = any([
+                seq_group_metadata.is_prompt
+                for seq_group_metadata in seq_group_metadata_list
+            ])
+            max_context_len = max([
+                max([
+                    len(v.prompt_token_ids) + len(v.output_token_ids)
+                    for v in seq_group_metadata.seq_data.values()
+                ]) for seq_group_metadata in seq_group_metadata_list
+            ])  # longest prompt + output token count in the batch
+            max_num_blocks = (
+                (max_context_len - 1) // self.cache_config.block_size) + 1
+            input_stats = (f'is_prompt: {is_prompt}, '
+                           f'num_seqs: {len(seq_group_metadata_list)}, '
+                           f'max_context_len: {max_context_len}, '
+                           f'max_num_blocks {max_num_blocks}')
+            gc_ctx = metric_localcontext(
+                "graph_compilation"
+            ) if log_graph_compilation else contextlib.nullcontext()
+            cpu_fallback_ctx = metric_localcontext(
+                "cpu_fallback"
+            ) if log_cpu_fallbacks else contextlib.nullcontext()
+            with gc_ctx as gc_local_metric, \
+                cpu_fallback_ctx as cpu_fallback_local_metric:
+                output = self.driver_worker.execute_model(execute_model_req)
+            if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
+                ) or log_graph_compilation_all:
+                msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
+                       f"{gc_local_metric.stats()}, {input_stats}")
+                logger.warning(msg)
+            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
+                    0) or log_cpu_fallbacks_all:
+                msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
+                       f"{cpu_fallback_local_metric.stats()}, {input_stats}")
+                logger.warning(msg)
+
+            return output
+
+        output = self.driver_worker.execute_model(execute_model_req)
+        return output
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.driver_worker.list_loras()
+
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def check_health(self) -> None:
+        # HPUExecutor will always be healthy as long as
+        # it's running.
+        return
+
+    def start_profile(self) -> None:
+        self.driver_worker.start_profile()
+
+    def stop_profile(self) -> None:
+        self.driver_worker.stop_profile()
+
+    def shutdown(self) -> None:
+        self.driver_worker.shutdown_inc()
+
+
+class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[SamplerOutput]:
+        output = await make_async(self.driver_worker.execute_model
+                                  )(execute_model_req=execute_model_req, )
+        return output
diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py
new file mode 100644
index 0000000000000..28d1882cb0db7
--- /dev/null
+++ b/vllm/executor/ray_hpu_executor.py
@@ -0,0 +1,554 @@
+import asyncio
+import os
+from collections import defaultdict
+from itertools import islice, repeat
+from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple,
+                    Type)
+
+import msgspec
+
+import vllm.envs as envs
+from vllm.executor.distributed_gpu_executor import (  # yapf: disable
+    DistributedGPUExecutor, DistributedGPUExecutorAsync)
+from vllm.executor.msgspec_utils import encode_hook
+from vllm.executor.ray_utils import RayWorkerWrapper, ray
+from vllm.logger import init_logger
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest
+from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
+                        get_ip, get_open_port, get_vllm_instance_id,
+                        make_async)
+from vllm.worker.worker_base import WorkerBase
+
+if ray is not None:
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+
+logger = init_logger(__name__)
+
+
+class RayHPUExecutor(DistributedGPUExecutor):
+
+    uses_ray: bool = True
+
+    def _init_executor(self) -> None:
+        self.forward_dag: Optional["ray.dag.CompiledDAG"] = None
+        # If the env var is set, it uses the Ray's compiled DAG API
+        # which optimizes the control plane overhead.
+        # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
+        # Currently, this requires VLLM_USE_RAY_SPMD_WORKER=1.
+        self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG
+        # If the env var is set, then we do not distinguish between the
+        # "driver worker" vs other workers. Also, the rank 0 worker will
+        # be executed in a remote Ray worker. Currently this requires
+        # VLLM_USE_RAY_COMPILED_DAG=1.
+        self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
+        if self.use_ray_compiled_dag:
+            assert self.use_ray_spmd_worker, (
+                "VLLM_USE_RAY_COMPILED_DAG=1 requires "
+                "VLLM_USE_RAY_SPMD_WORKER=1")
+        if self.use_ray_spmd_worker:
+            # TODO: Support SPMD worker for non-DAG Ray executor.
+            assert self.use_ray_compiled_dag, (
+                "VLLM_USE_RAY_SPMD_WORKER=1 requires "
+                "VLLM_USE_RAY_COMPILED_DAG=1")
+
+        assert self.uses_ray
+        placement_group = self.parallel_config.placement_group
+
+        # Disable Ray usage stats collection.
+        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
+        if ray_usage != "1":
+            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
+
+        # Create the parallel HPU workers.
+        self._init_workers_ray(placement_group)
+
+        self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
+        self.output_decoder = msgspec.msgpack.Decoder(
+            Optional[List[SamplerOutput]])
+
+    def shutdown(self) -> None:
+        if hasattr(self, "forward_dag") and self.forward_dag is not None:
+            self.forward_dag.teardown()
+            import ray
+            for worker in self.workers:
+                ray.kill(worker)
+            self.forward_dag = None
+
+    def finish_measurements(self):
+        self._run_workers("finish_measurements")
+
+    def _get_worker_module_and_class(
+        self
+    ) -> Tuple[str, str, Optional[Callable[[],
+                                           Type[WorkerBase]]]]:  # noqa: F821
+        worker_class_fn = None
+        if self.scheduler_config.is_multi_step:
+            raise NotImplementedError(
+                "Multi-step execution is not implemented for HPU")
+        elif self.speculative_config:
+            raise NotImplementedError(
+                "Speculative decoding is not implemented for HPU")
+        else:
+            worker_module_name = "vllm.worker.hpu_worker"
+            worker_class_name = "HPUWorker"
+        return (worker_module_name, worker_class_name, worker_class_fn)
+
+    def _get_worker_wrapper_args(self) -> Dict[str, Any]:
+        (worker_module_name, worker_class_name,
+         worker_class_fn) = self._get_worker_module_and_class()
+
+        return dict(
+            worker_module_name=worker_module_name,
+            worker_class_name=worker_class_name,
+            worker_class_fn=worker_class_fn,
+            trust_remote_code=self.model_config.trust_remote_code,
+        )
+
+    def _init_workers_ray(self, placement_group: "PlacementGroup",
+                          **ray_remote_kwargs):
+        # Each ray worker is allocated one full HPU.
+        num_gpus = 1
+
+        # The driver dummy worker does not actually use any resources.
+        # It holds the resource for the driver worker.
+        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
+        # The remaining workers are the actual ray actors.
+        self.workers: List[RayWorkerWrapper] = []
+
+        # Used in ray compiled DAG: indexed first by PP rank,
+        # and then TP rank. In other words, the inner list is
+        # the TP group of workers for a PP rank.
+        self.pp_tp_workers: List[List[RayWorkerWrapper]] = []
+
+        logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
+
+        # Create the workers.
+        driver_ip = get_ip()
+        worker_wrapper_kwargs = self._get_worker_wrapper_args()
+        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
+            if not bundle.get("HPU", 0):
+                continue
+            scheduling_strategy = PlacementGroupSchedulingStrategy(
+                placement_group=placement_group,
+                placement_group_capture_child_tasks=True,
+                placement_group_bundle_index=bundle_id,
+            )
+
+            worker = ray.remote(
+                num_cpus=0,
+                num_gpus=0,
+                resources={'HPU': num_gpus},
+                scheduling_strategy=scheduling_strategy,
+                **ray_remote_kwargs,
+            )(RayWorkerWrapper).remote(**worker_wrapper_kwargs)
+
+            if self.use_ray_spmd_worker:
+                self.workers.append(worker)
+            else:
+                worker_ip = ray.get(worker.get_node_ip.remote())
+                if worker_ip == driver_ip and self.driver_dummy_worker is None:
+                    # If the worker is on the same node as the driver, we use it
+                    # as the resource holder for the driver process.
+                    self.driver_dummy_worker = worker
+                    self.driver_worker = RayWorkerWrapper(
+                        **worker_wrapper_kwargs)
+                else:
+                    # Else, added to the list of workers.
+                    self.workers.append(worker)
+
+        logger.debug("workers: %s", self.workers)
+        logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
+        if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
+            raise ValueError(
+                "Ray does not allocate any HPUs on the driver node. Consider "
+                "adjusting the Ray placement group or running the driver on "
+                "an HPU node.")
+
+        worker_ips = [
+            ray.get(worker.get_node_ip.remote())  # type: ignore[attr-defined]
+            for worker in self.workers
+        ]
+        worker_to_ip = dict(zip(self.workers, worker_ips))
+        ip_counts: Dict[str, int] = {}
+        for ip in worker_ips:
+            ip_counts[ip] = ip_counts.get(ip, 0) + 1
+
+        def sort_by_driver_then_worker_ip(worker):
+            """Sort the workers based on 3 properties:
+            1. If the worker is on the same node as the driver (vllm engine),
+                it should be placed first.
+            2. Then, if the worker is on a node with fewer workers, it should
+                be placed first.
+            3. Finally, if the worker is on a node with smaller IP address, it
+                should be placed first.
+            """
+            ip = worker_to_ip[worker]
+            return (ip != driver_ip, ip_counts[ip], ip)
+
+        # After sorting, the workers on the same node will be
+        # close to each other, and the workers on the driver
+        # node will be placed first.
+        self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
+
+        # Get the set of GPU IDs used on each node.
+        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
+                                                    use_dummy_driver=True)
+
+        node_workers = defaultdict(list)  # node id -> list of worker ranks
+        node_gpus = defaultdict(list)  # node id -> list of gpu ids
+
+        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
+            node_workers[node_id].append(i)
+            # `gpu_ids` can be a list of strings or integers.
+            # convert them to integers for consistency.
+            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
+            # string sorting is not sufficient.
+            # see https://github.com/vllm-project/vllm/issues/5590
+            gpu_ids = [int(x) for x in gpu_ids]
+            node_gpus[node_id].extend(gpu_ids)
+        for node_id, gpu_ids in node_gpus.items():
+            node_gpus[node_id] = sorted(gpu_ids)
+
+        all_ips = set(worker_ips + [driver_ip])
+        n_ips = len(all_ips)
+        n_nodes = len(node_workers)
+
+        if n_nodes != n_ips:
+            raise RuntimeError(
+                f"Every node should have a unique IP address. Got {n_nodes}"
+                f" nodes with node ids {list(node_workers.keys())} and "
+                f"{n_ips} unique IP addresses {all_ips}. Please check your"
+                " network configuration. If you set `VLLM_HOST_IP` or "
+                "`HOST_IP` environment variable, make sure it is unique for"
+                " each node.")
+
+        VLLM_INSTANCE_ID = get_vllm_instance_id()
+
+        # Set environment variables for the driver and workers.
+        all_args_to_update_environment_variables = [({
+            "VLLM_INSTANCE_ID":
+            VLLM_INSTANCE_ID,
+            "VLLM_TRACE_FUNCTION":
+            str(envs.VLLM_TRACE_FUNCTION),
+        }, ) for (node_id, _) in worker_node_and_gpu_ids]
+        self._run_workers("update_environment_variables",
+                          all_args=all_args_to_update_environment_variables)
+
+        if len(node_gpus) == 1:
+            # in single node case, we don't need to get the IP address.
+            # the loopback address is sufficient
+            # NOTE: a node may have several IP addresses, one for each
+            # network interface. `get_ip()` might return any of them,
+            # while they might not work for communication inside the node
+            # if the network setup is complicated. Using the loopback address
+            # solves this issue, as it always works for communication inside
+            # the node.
+            driver_ip = "127.0.0.1"
+        distributed_init_method = get_distributed_init_method(
+            driver_ip, get_open_port())
+
+        # Initialize the actual workers inside worker wrapper.
+        init_worker_all_kwargs = [
+            self._get_worker_kwargs(
+                local_rank=node_workers[node_id].index(rank),
+                rank=rank,
+                distributed_init_method=distributed_init_method,
+            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
+        ]
+        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
+
+        self._run_workers("init_device")
+        self._run_workers("load_model",
+                          max_concurrent_workers=self.parallel_config.
+                          max_parallel_loading_workers)
+
+        if self.use_ray_spmd_worker:
+            for pp_rank in range(self.parallel_config.pipeline_parallel_size):
+                self.pp_tp_workers.append([])
+                for tp_rank in range(
+                        self.parallel_config.tensor_parallel_size):
+                    # PP=2, TP=4
+                    # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
+                    rank = (pp_rank * self.parallel_config.tensor_parallel_size
+                            ) + tp_rank
+                    assert len(self.pp_tp_workers[pp_rank]) == tp_rank
+                    assert pp_rank < len(self.pp_tp_workers)
+                    self.pp_tp_workers[pp_rank].append(self.workers[rank])
+
+        # This is the list of workers that are rank 0 of each TP group EXCEPT
+        # global rank 0. These are the workers that will broadcast to the
+        # rest of the workers.
+        self.tp_driver_workers: List[RayWorkerWrapper] = []
+        # This is the list of workers that are not drivers and not the first
+        # worker in a TP group. These are the workers that will be
+        # broadcasted to.
+        self.non_driver_workers: List[RayWorkerWrapper] = []
+
+        # Enforce rank order for correct rank to return final output.
+        for index, worker in enumerate(self.workers):
+            # The driver worker is rank 0 and not in self.workers.
+            rank = index + 1
+            if rank % self.parallel_config.tensor_parallel_size == 0:
+                self.tp_driver_workers.append(worker)
+            else:
+                self.non_driver_workers.append(worker)
+
+    def _driver_execute_model(
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
+        """Run execute_model in the driver worker.
+
+        Passing None will cause the driver to stop the model execution
+        loop running in each of the remote workers.
+        """
+        assert not self.use_ray_spmd_worker, (
+            "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
+        return self.driver_worker.execute_method("execute_model",
+                                                 execute_model_req)
+
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        if not self.use_ray_spmd_worker:
+            return super().execute_model(execute_model_req)
+
+        if self.forward_dag is None:
+            self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
+
+        serialized_data = self.input_encoder.encode(execute_model_req)
+        outputs = ray.get(self.forward_dag.execute(serialized_data))
+        output = self.output_decoder.decode(outputs[0])
+        return output
+
+    def _run_workers(
+        self,
+        method: str,
+        *args,
+        async_run_tensor_parallel_workers_only: bool = False,
+        all_args: Optional[List[Tuple[Any, ...]]] = None,
+        all_kwargs: Optional[List[Dict[str, Any]]] = None,
+        use_dummy_driver: bool = False,
+        max_concurrent_workers: Optional[int] = None,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers. Can be used in the following
+        ways:
+
+        Args:
+        - async_run_tensor_parallel_workers_only: If True the method will be
+          run only in the remote TP workers, not the driver worker.
+          It will also be run asynchronously and return a list of futures
+          rather than blocking on the results.
+        - args/kwargs: All workers share the same args/kwargs
+        - all_args/all_kwargs: args/kwargs for each worker are specified
+          individually
+        """
+        if self.use_ray_spmd_worker:
+            assert not async_run_tensor_parallel_workers_only, (
+                "async_run_tensor_parallel_workers_only is not supported for "
+                "spmd mode.")
+
+        if max_concurrent_workers:
+            raise NotImplementedError(
+                "max_concurrent_workers is not supported yet.")
+
+        count = len(self.workers) if not \
+            async_run_tensor_parallel_workers_only \
+            else len(self.non_driver_workers)
+        # If using SPMD worker, all workers are the same, so we should execute
+        # the args on all workers. Otherwise, we skip the first worker's args
+        # because those args will go to the driver worker.
+        first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1
+        all_worker_args = repeat(args, count) if all_args is None \
+            else islice(all_args, first_worker_args_index, None)
+        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
+            else islice(all_kwargs, first_worker_args_index, None)
+
+        # Start the ray workers first.
+        ray_workers = self.workers
+        if async_run_tensor_parallel_workers_only:
+            ray_workers = self.non_driver_workers
+        ray_worker_outputs = [
+            worker.execute_method.remote(method, *worker_args, **worker_kwargs)
+            for (worker, worker_args, worker_kwargs
+                 ) in zip(ray_workers, all_worker_args, all_worker_kwargs)
+        ]
+
+        if async_run_tensor_parallel_workers_only:
+            # Just return futures
+            return ray_worker_outputs
+
+        driver_worker_output = []
+        # In SPMD mode, the driver worker is the same as any other worker,
+        # so we only explicitly execute on the driver worker if using a
+        # non-SPMD worker class.
+        if not self.use_ray_spmd_worker:
+            driver_args = args if all_args is None else all_args[0]
+            driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
+
+            # Start the driver worker after all the ray workers.
+            if not use_dummy_driver:
+                driver_worker_output = [
+                    self.driver_worker.execute_method(method, *driver_args,
+                                                      **driver_kwargs)
+                ]
+            else:
+                assert self.driver_dummy_worker is not None
+                driver_worker_output = [
+                    ray.get(
+                        self.driver_dummy_worker.execute_method.remote(
+                            method, *driver_args, **driver_kwargs))
+                ]
+
+        # Get the results of the ray workers.
+        if self.workers:
+            ray_worker_outputs = ray.get(ray_worker_outputs)
+
+        return driver_worker_output + ray_worker_outputs
+
+    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
+        """Wait for futures returned from _run_workers() with
+        async_run_tensor_parallel_workers_only=True to complete."""
+        ray.get(parallel_worker_tasks)
+
+    def _check_ray_adag_installation(self):
+        import importlib.metadata
+        import importlib.util
+
+        from packaging import version
+
+        required_version = version.parse("2.35")
+        current_version = version.parse(importlib.metadata.version("ray"))
+        # TODO: update the constraint once we adapt to the backward
+        # incompatible API change from ray 2.36
+        if current_version != required_version:
+            raise ValueError(f"Ray version {required_version} is "
+                             f"required, but found {current_version}")
+
+        adag_spec = importlib.util.find_spec(
+            "ray.experimental.compiled_dag_ref")
+        if adag_spec is None:
+            raise ValueError("Ray accelerated DAG is not installed. "
+                             "Run `pip install ray[adag]` to install it.")
+
+    def _compiled_ray_dag(self, enable_asyncio: bool):
+        """Build the per-stage SPMD Ray DAG and return it compiled."""
+        assert self.parallel_config.use_ray
+        self._check_ray_adag_installation()
+        from ray.dag import InputNode, MultiOutputNode
+        from ray.experimental.channel.torch_tensor_type import TorchTensorType
+
+        final_stage = len(self.pp_tp_workers) - 1
+        with InputNode() as input_data:
+            # Example DAG: PP=2, TP=4
+            # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput   # noqa: E501
+            #                         -> 1 -> (ExecuteModelReq, IntermediateOutput) -> 5 -> SamplerOutput   # noqa: E501
+            #                         -> 2 -> (ExecuteModelReq, IntermediateOutput) -> 6 -> SamplerOutput   # noqa: E501
+            #                         -> 3 -> (ExecuteModelReq, IntermediateOutput) -> 7 -> SamplerOutput   # noqa: E501
+
+            # The first TP group is fed the ExecuteModelRequest directly,
+            # one reference to the DAG input per worker.
+            stage_io = [input_data for _ in self.pp_tp_workers[0]]
+            for stage, tp_group in enumerate(self.pp_tp_workers):
+                # Worker i of this stage consumes entry i produced by the
+                # previous stage; the whole TP group runs in SPMD fashion.
+                stage_io = [
+                    worker.execute_model_spmd.
+                    bind(  # type: ignore[attr-defined]
+                        stage_io[idx])
+                    for idx, worker in enumerate(tp_group)
+                ]
+                if stage == final_stage:
+                    # No need to specify the tensor transport for the last
+                    # pp stage.
+                    continue
+                # Specify how intermediate tensors should be passed to the
+                # next pp stage.
+                stage_io = [
+                    node.with_type_hint(TorchTensorType(transport="auto"))
+                    for node in stage_io
+                ]
+
+            forward_dag = MultiOutputNode(stage_io)
+        return forward_dag.experimental_compile(enable_asyncio=enable_asyncio)
+
+    def __del__(self):
+        self.shutdown()  # delegate teardown to shutdown() on finalization
+
+
+class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync):
+    """Asyncio variant of RayHPUExecutor.
+
+    Steps run through the compiled Ray DAG when VLLM_USE_RAY_SPMD_WORKER is
+    set; otherwise execution is delegated to the parent implementation.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.pp_locks: Optional[List[asyncio.Lock]] = None
+        self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
+        if not self.use_ray_compiled_dag:
+            self.driver_exec_method = make_async(
+                self.driver_worker.execute_method)
+
+    async def execute_model_async(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        """Run one step, via the compiled DAG when SPMD workers are used."""
+        if not self.use_ray_spmd_worker:
+            return await super().execute_model_async(execute_model_req)
+
+        if self.forward_dag is None:
+            self.forward_dag = self._compiled_ray_dag(enable_asyncio=True)
+
+        encoded_req = self.input_encoder.encode(execute_model_req)
+        pending = await self.forward_dag.execute_async(encoded_req)
+        stage_outputs = await pending
+        return self.output_decoder.decode(stage_outputs[0])
+
+    async def _driver_execute_model_async(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> List[SamplerOutput]:
+        """Execute on the driver of every PP stage; return the last result."""
+        assert not self.use_ray_spmd_worker, (
+            "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
+        if not self.tp_driver_workers:
+            return await self.driver_exec_method("execute_model",
+                                                 execute_model_req)
+        if self.pp_locks is None:
+            # One lock per pipeline parallel stage keeps multiple virtual
+            # engines from executing on the same stage at the same time.
+            # The locks are created here rather than in the constructor,
+            # which uses a different asyncio loop.
+            num_stages = self.parallel_config.pipeline_parallel_size
+            self.pp_locks = [asyncio.Lock() for _ in range(num_stages)]
+
+        # Stage 0 runs on the local driver, later stages on remote drivers.
+        stage_methods = [self.driver_exec_method]
+        stage_methods.extend(worker.execute_method.remote
+                             for worker in self.tp_driver_workers)
+        tasks = [
+            asyncio.create_task(
+                _run_task_with_lock(method, self.pp_locks[pp_rank],
+                                    "execute_model", execute_model_req))
+            for pp_rank, method in enumerate(stage_methods)
+        ]
+        # Only the last PP stage has the final results.
+        return (await asyncio.gather(*tasks))[-1]
+
+    async def _start_worker_execution_loop(self):
+        """Start the execution loop on all non-driver workers and await it."""
+        assert not self.use_ray_spmd_worker, (
+            "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1")
+        return await asyncio.gather(
+            *(worker.execute_method.remote("start_worker_execution_loop")
+              for worker in self.non_driver_workers))
+
+    def __del__(self):
+        """Shut the executor down when the object is finalized."""
+        self.shutdown()
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 993d279890820..41dd59bc65ec5 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -249,7 +249,11 @@ def initialize_ray_cluster(
         # Placement group is already set.
         return
 
-    device_str = "GPU" if not current_platform.is_tpu() else "TPU"
+    device_str = "GPU"
+    if current_platform.is_tpu():
+        device_str = "TPU"
+    elif current_platform.is_hpu():
+        device_str = 'HPU'
     # Create placement group for worker processes
     current_placement_group = ray.util.get_current_placement_group()
     if current_placement_group:
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index bfca15c2b6a3e..24d75f4df4e02 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -55,10 +55,9 @@ def forward_tpu(self, *args, **kwargs):
         # NOTE(woosuk): This is a placeholder for future extensions.
         return self.forward_native(*args, **kwargs)
 
-    def forward_gaudi(self, *args, **kwargs):
+    def forward_hpu(self, *args, **kwargs):
         # By default, we assume that Gaudi ops are compatible with the
         # PyTorch-native implementation.
-        # NOTE(woosuk): This is a placeholder for future extensions.
         return self.forward_native(*args, **kwargs)
 
     def dispatch_forward(self):
@@ -76,6 +75,8 @@ def dispatch_forward(self):
             return self.forward_hip
         elif current_platform.is_cpu():
             return self.forward_cpu
+        elif current_platform.is_hpu():
+            return self.forward_hpu
         elif current_platform.is_tpu():
             return self.forward_tpu
         elif current_platform.is_xpu():
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 30b43f375dd5c..345919c5d1636 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -92,6 +92,25 @@ def forward_cuda(
         )
         return out
 
+    def forward_hpu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """RMSNorm via HPUFusedRMSNorm; forward_native if it is None."""
+        from vllm_hpu_extension.ops import HPUFusedRMSNorm
+        if HPUFusedRMSNorm is None:
+            return self.forward_native(x, residual)
+        if residual is None:
+            return HPUFusedRMSNorm.apply(x, self.weight,
+                                         self.variance_epsilon)
+        input_shape = x.shape
+        residual += x.view(residual.shape)
+        # Note: HPUFusedRMSNorm requires 3D tensors as inputs
+        normed = HPUFusedRMSNorm.apply(residual, self.weight,
+                                       self.variance_epsilon)
+        return normed.view(input_shape), residual
+
     def forward_xpu(
         self,
         x: torch.Tensor,
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 288f5a1134b6b..fb76b1b17925e 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -111,8 +111,14 @@ def _prune_hidden_states(
     hidden_states: torch.Tensor,
     sampling_metadata: SamplingMetadata,
 ) -> torch.Tensor:
-    return hidden_states.index_select(0,
-                                      sampling_metadata.selected_token_indices)
+    # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios
+    # (warmup, profile_run) we might not have selected_token_indices,
+    # so we skip pruning.
+    if sampling_metadata.selected_token_indices is not None:
+        return hidden_states.index_select(
+            0, sampling_metadata.selected_token_indices)
+    else:
+        return hidden_states
 
 
 def _apply_logits_processors(
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index ac60e0e6d48a0..63ceec63e8317 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -194,6 +194,61 @@ def forward_xpu(
                                  self.cos_sin_cache, self.is_neox_style)
         return query, key
 
+    def forward_hpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Apply rotary embeddings to query and key with the HPU kernel.
+
+        Only the first ``rotary_dim`` features of every head are rotated; the
+        remaining features are passed through unchanged. When ``offsets`` is
+        given it is added to the flattened ``positions`` before the cos/sin
+        lookup.
+
+        Returns:
+            The rotated ``(query, key)`` pair, each in its original shape.
+        """
+        from habana_frameworks.torch.hpex.kernels import (
+            RotaryPosEmbeddingMode, apply_rotary_pos_emb)
+        positions = positions.flatten()
+        if offsets is not None:
+            positions = positions + offsets
+        num_tokens = positions.shape[0]
+        cos_sin = self.cos_sin_cache.index_select(0, positions).view(
+            num_tokens, 1, -1)
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        # The HPU RoPE kernel requires cos and sin to have the hidden
+        # dimension of the tensor being rotated, so both are expanded:
+        # - GPT-NeoX kernel: position_ids = None, offset, mode = BLOCKWISE,
+        #   cos/sin expanded via concatenation
+        # - GPT-J kernel: position_ids = None, offset = 0, mode = PAIRWISE,
+        #   cos/sin expanded via repeat_interleave
+        rope_mode: RotaryPosEmbeddingMode
+        if self.is_neox_style:
+            rope_mode = RotaryPosEmbeddingMode.BLOCKWISE
+            cos, sin = (torch.cat((t, t), dim=-1) for t in (cos, sin))
+        else:
+            rope_mode = RotaryPosEmbeddingMode.PAIRWISE
+            full_dim = cos_sin.shape[-1]
+            cos, sin = (torch.repeat_interleave(t, 2, dim=-1,
+                                                output_size=full_dim)
+                        for t in (cos, sin))
+
+        def rotate(states: torch.Tensor) -> torch.Tensor:
+            # Rotate the leading rotary_dim features of each head, keep the
+            # rest, and restore the caller's layout.
+            per_head = states.view(num_tokens, -1, self.head_size)
+            rotated = apply_rotary_pos_emb(per_head[..., :self.rotary_dim],
+                                           cos, sin, None, 0, rope_mode)
+            untouched = per_head[..., self.rotary_dim:]
+            return torch.cat((rotated, untouched),
+                             dim=-1).reshape(states.shape)
+
+        return rotate(query), rotate(key)
+
     def extra_repr(self) -> str:
         s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
         s += f", max_position_embeddings={self.max_position_embeddings}"
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index b448557af13b3..52771f50a7a23 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -12,6 +12,7 @@
     QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding)
 from vllm.model_executor.parameter import BasevLLMParameter
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 
 DEFAULT_VOCAB_PADDING_SIZE = 64
 
@@ -382,8 +383,20 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
 
         # Copy the data.
         loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
-        param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
-        param[loaded_weight.shape[0]:].data.fill_(0)
+
+        if current_platform.is_hpu():
+            # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here,
+            # so we're using a workaround. Remove this when fixed in
+            # HPU PT bridge.
+            padded_weight = torch.cat([
+                loaded_weight,
+                torch.zeros(param.shape[0] - loaded_weight.shape[0],
+                            *loaded_weight.shape[1:])
+            ])
+            param.data.copy_(padded_weight)
+        else:
+            param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
+            param[loaded_weight.shape[0]:].data.fill_(0)
 
     def forward(self, input_):
         if self.tp_size > 1:
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index ee02368bec8a8..84f35f75a0c32 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -284,7 +284,8 @@ def _prepare_seq_groups(
         else:
             # Decode
             prompt_logprob_len = 0
-            query_len = query_lens[i] if query_lens is not None else 1
+            query_len = query_lens[i] if query_lens is not None and len(
+                query_lens) > 0 else 1
             sample_len = len(seq_ids) * query_len if do_sample else 0
 
             if sampling_params.seed is not None and generators is not None:
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 524150920b854..9e740837381f8 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -42,6 +42,13 @@
 except Exception:
     pass
 
+is_hpu = False  # becomes True when the habana_frameworks package is found
+try:
+    from importlib import util
+    is_hpu = util.find_spec('habana_frameworks') is not None
+except Exception:
+    pass  # the lookup itself failed: keep is_hpu False
+
 is_xpu = False
 
 try:
@@ -86,6 +93,9 @@
 elif is_rocm:
     from .rocm import RocmPlatform
     current_platform = RocmPlatform()
+elif is_hpu:
+    from .hpu import HpuPlatform
+    current_platform = HpuPlatform()
 elif is_xpu:
     from .xpu import XPUPlatform
     current_platform = XPUPlatform()
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
new file mode 100644
index 0000000000000..170cfff94f90d
--- /dev/null
+++ b/vllm/platforms/hpu.py
@@ -0,0 +1,11 @@
+import torch
+
+from .interface import Platform, PlatformEnum
+
+
+class HpuPlatform(Platform):
+    _enum = PlatformEnum.HPU  # makes Platform.is_hpu() return True
+
+    @staticmethod
+    def inference_mode():
+        return torch.no_grad()  # no_grad() is used as the inference context
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index c3a3e7a284457..81d8bdae2383c 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -10,6 +10,7 @@ class PlatformEnum(enum.Enum):
     CUDA = enum.auto()
     ROCM = enum.auto()
     TPU = enum.auto()
+    HPU = enum.auto()
     XPU = enum.auto()
     CPU = enum.auto()
     NEURON = enum.auto()
@@ -46,6 +47,9 @@ def is_rocm(self) -> bool:
     def is_tpu(self) -> bool:
         return self._enum == PlatformEnum.TPU
 
+    def is_hpu(self) -> bool:
+        return self._enum == PlatformEnum.HPU  # True only for the HPU enum
+
     def is_xpu(self) -> bool:
         return self._enum == PlatformEnum.XPU
 
diff --git a/vllm/utils.py b/vllm/utils.py
index 6edc8d72f6bcf..d78130873d3dc 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -728,6 +728,9 @@ def is_pin_memory_available() -> bool:
     elif current_platform.is_neuron():
         print_warning_once("Pin memory is not supported on Neuron.")
         return False
+    elif current_platform.is_hpu():
+        print_warning_once("Pin memory is not supported on HPU.")
+        return False
     elif current_platform.is_cpu() or current_platform.is_openvino():
         return False
     return True
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
new file mode 100644
index 0000000000000..5008a2abd22ea
--- /dev/null
+++ b/vllm/worker/hpu_model_runner.py
@@ -0,0 +1,2008 @@
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+import collections
+import contextlib
+import dataclasses
+import functools
+import gc
+import itertools
+import math
+import operator
+import os
+import time
+from array import array
+from dataclasses import dataclass, field
+from enum import IntEnum
+from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple,
+                    Optional, Set, Tuple, Type, TypeVar, Union)
+
+import habana_frameworks.torch as htorch
+import habana_frameworks.torch.internal.bridge_config as bc
+import torch
+from vllm_hpu_extension.ops import LoraMask as LoraMask
+from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler,
+                                         HabanaMemoryProfiler, format_bytes)
+
+from vllm.attention import AttentionMetadata, get_attn_backend
+from vllm.config import DeviceConfig, VllmConfig
+from vllm.distributed.parallel_state import get_world_group
+from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.model_executor.model_loader import get_model
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
+                             MultiModalInputs)
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (IntermediateTensors, SequenceData,
+                           SequenceGroupMetadata)
+from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.worker.model_runner_base import (
+    ModelRunnerBase, ModelRunnerInputBase,
+    _add_attn_metadata_broadcastable_dict,
+    _add_sampling_metadata_broadcastable_dict,
+    _init_attn_metadata_from_tensor_dict,
+    _init_sampling_metadata_from_tensor_dict)
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionBackend
+
+logger = init_logger(__name__)
+
+_TYPE_CACHE = {}  # typename -> namedtuple class, filled by subtuple()
+# These values are assumed to be zero in several places.
+# Use caution when updating them!
+_PAD_SLOT_ID = 0
+_PAD_BLOCK_ID = 0
+
+LORA_WARMUP_RANK = 8
+
+
+class Singleton(type):
+    """Metaclass that creates each of its classes at most once."""
+    _instances: Dict[type, object] = {}
+
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super().__call__(*args, **kwargs)
+        return cls._instances[cls]
+
+
+@dataclass  # Singleton metaclass: every call returns one shared instance
+class HPUBucketingGlobalState(metaclass=Singleton):
+    prompt_bs_bucket_cfg: Tuple[int, int, int] = field(init=False)
+    decode_bs_bucket_cfg: Tuple[int, int, int] = field(init=False)
+    prompt_seq_bucket_cfg: Tuple[int, int, int] = field(init=False)
+    decode_block_bucket_cfg: Tuple[int, int, int] = field(init=False)
+    prompt_buckets: List[Tuple[int, int]] = field(init=False)
+    decode_buckets: List[Tuple[int, int]] = field(init=False)
+
+
+def subtuple(obj: object,
+             typename: str,
+             to_copy: List[str],
+             to_override: Optional[Dict[str, object]] = None):
+    """Copy to_copy (+ to_override) fields of obj into a sorted namedtuple."""
+    if obj is None:
+        return None
+    overrides = to_override or {}
+    # Overridden fields are never read from obj, so obj may lack them.
+    values = {n: getattr(obj, n) for n in set(to_copy) - set(overrides)}
+    values.update(overrides)
+    if typename not in _TYPE_CACHE:
+        _TYPE_CACHE[typename] = collections.namedtuple(typename, sorted(values))
+    return _TYPE_CACHE[typename](**values)
+
+
+def read_bucket_settings(phase: str, dim: str, **defaults):
+    """Read one bucketing configuration from environment variables.
+
+    phase is either 'prompt' or 'decode'; dim is either 'bs', 'seq' or
+    'block'. For each of 'min', 'step' and 'max' the upper-cased variable
+    VLLM_{phase}_{dim}_BUCKET_{param} is read (for example
+    VLLM_DECODE_BS_BUCKET_STEP=128), falling back to defaults[param].
+    Returns the three values as ints, in min/step/max order.
+    """
+    values = []
+    for param in ('min', 'step', 'max'):
+        env_var = f'VLLM_{phase}_{dim}_BUCKET_{param}'.upper()
+        default = defaults[param]
+        value = int(os.environ.get(env_var, default))
+        logger.info('%s=%s (default:%s)', env_var, value, default)
+        values.append(value)
+    return values
+
+
+def warmup_range(config: Tuple[int, int, int]):
+    """Generate a warmup range.
+
+    Start from bmin and multiply by 2 until you reach bstep (ramp-up).
+    Then, increase the values by bstep until you reach bmax (stable part).
+    Example: (bmin, bstep, bmax) = (2, 32, 64)
+    => ramp_up = [2, 4, 8, 16], stable = [32, 64]
+    => returns [2, 4, 8, 16, 32, 64]
+    Raises ValueError when bmin or bstep is not positive.
+    """
+    bmin, bstep, bmax = config
+    if bmin <= 0 or bstep <= 0:
+        # Doubling a non-positive bmin would never reach bstep.
+        raise ValueError(f"Bucket min and step must be positive: {config}")
+    assert bmin <= bmax, ("Min. batch size cannot be greater than max. "
+                          "batch size. If you want to skip warmup, "
+                          "set VLLM_SKIP_WARMUP=true")
+    ramp_up = []
+    value = bmin
+    while value < bstep and value <= bmax:
+        ramp_up.append(value)
+        value *= 2
+    stable = range(bstep, bmax + 1, bstep)
+    return ramp_up + [bucket for bucket in stable if bucket >= bmin]
+
+
+def generate_prompt_buckets(bs_bucket_config,
+                            seq_bucket_config,
+                            max_num_batched_tokens=None):
+    """Return ``(captured, omitted)`` prompt buckets (batch_size, seq_len).
+
+    Buckets over ``max_num_batched_tokens`` are omitted; if that would leave
+    none, an error is logged and the budget is ignored. Raises ValueError
+    when the configuration yields no buckets at all.
+    """
+
+    def warmup_order(b):
+        return b[0] * b[1], b[1], b[0]
+
+    buckets = list(
+        itertools.product(warmup_range(bs_bucket_config),
+                          warmup_range(seq_bucket_config)))
+    if not buckets:
+        raise ValueError(
+            "No buckets could be captured with following config "
+            f"(min, step, max_warmup): bs:{bs_bucket_config}, "
+            f"seq:{seq_bucket_config}")
+    if max_num_batched_tokens is None:
+        return sorted(buckets, key=warmup_order), []
+
+    # Remove buckets exceeding batch token budget
+    captured = [b for b in buckets if b[0] * b[1] <= max_num_batched_tokens]
+    if not captured:
+        # we can handle this if we ignore max_num_batched_tokens
+        min_reqd_budget = min(b[0] * b[1] for b in buckets)
+        msg = (
+            "The current bucketing configuration "
+            f"(min, step, max_warmup): "
+            f"bs:{bs_bucket_config}, "
+            f"seq:{seq_bucket_config} cannot be used with specified "
+            f"max_num_batched_tokens ({max_num_batched_tokens}), as the "
+            f"smallest bucket ({min_reqd_budget}) would exceed token "
+            "budget. Please increase max_num_batched_tokens or decrease "
+            "bucket minimum. Ignoring max_num_batched_tokens at risk of "
+            "out-of-memory errors.")
+        logger.error(msg)
+        return sorted(buckets, key=warmup_order), []
+    # Set membership keeps the omitted-bucket scan linear.
+    kept = set(captured)
+    omitted = sorted(b for b in buckets if b not in kept)
+    return sorted(captured, key=warmup_order), omitted
+
+
+def generate_decode_buckets(bs_bucket_config, blocks_bucket_config,
+                            max_blocks):
+    """Return the (batch_size, num_blocks) decode buckets to warm up.
+
+    A pair is kept when batch_size <= num_blocks <= max_blocks rounded up
+    to the block step; this relies on warmup_range() returning ascending
+    values. Ordered by bs * blocks, then blocks, then bs.
+    """
+    bs_buckets = warmup_range(bs_bucket_config)
+    block_buckets = warmup_range(blocks_bucket_config)
+    block_limit = round_up(max_blocks, blocks_bucket_config[1])
+    usable = [blocks for blocks in block_buckets if blocks <= block_limit]
+    pairs = [(bs, blocks) for bs in bs_buckets for blocks in usable
+             if blocks >= bs]
+    return sorted(pairs, key=lambda b: (b[0] * b[1], b[1], b[0]))
+
+
+def next_pow2(value: int, base: int):
+    """Return base * 2**k for the smallest k >= 0 with 2**k >= value."""
+    if value <= 1:
+        return base
+    # (value - 1).bit_length() equals ceil(log2(value)) for value >= 2.
+    return base * (1 << (value - 1).bit_length())
+
+
+def round_up(value: int, k: int) -> int:  # ceiling of value to a multiple of k
+    return (value + k - 1) // k * k
+
+
+def find_bucket(value: int, config: Tuple[int, int, int]):
+    """Return max(bmin, min(value rounded up to bstep, next_pow2 of value))."""
+    bmin, bstep, _ = config
+    candidate = min(round_up(value, bstep), next_pow2(value, bmin))
+    return max(bmin, candidate)
+
+
+def align_workers(value, op):
+    """All-reduce value with op over the world CPU group (world size > 1)."""
+    cpu_group = get_world_group().cpu_group
+    if torch.distributed.get_world_size() <= 1:
+        return value
+    reduced = torch.tensor(value, device='cpu')
+    torch.distributed.all_reduce(reduced, op=op, group=cpu_group)
+    return reduced.item()
+
+
+def setup_profiler():
+    """Create a torch profiler recording CPU and HPU activity.
+
+    The schedule uses wait=0, warmup=2, active=1, repeat=1; traces go
+    gzip-compressed to the current directory via the TensorBoard handler.
+    """
+    schedule = torch.profiler.schedule(wait=0, warmup=2, active=1, repeat=1)
+    trace_handler = torch.profiler.tensorboard_trace_handler('.',
+                                                             use_gzip=True)
+    return torch.profiler.profile(
+        schedule=schedule,
+        activities=[
+            torch.profiler.ProfilerActivity.CPU,
+            torch.profiler.ProfilerActivity.HPU,
+        ],
+        on_trace_ready=trace_handler,
+        record_shapes=False,
+        with_stack=True)
+
+
+def pad_list(list, k, v):
+    """Return list extended with copies of v up to a multiple of k items."""
+    padded_len = round_up(len(list), k)
+    return list + [v] * (padded_len - len(list))
+
+
+def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt):
+    """Return (block indices, in-block offsets) for the flattened slots.
+    Prompts keep one index per group of block_size slots and no offsets.
+    """
+    flat_slots = slot_mapping.flatten()
+    block_ids = torch.div(flat_slots, block_size, rounding_mode="floor")
+    if not is_prompt:
+        return block_ids, torch.fmod(flat_slots, block_size)
+    return block_ids.unflatten(0, (-1, block_size))[:, 0], None
+
+
+class HpuModelAdapter():
+    """Wraps a model and prepares HPU attention metadata for each call."""
+
+    def __init__(self, model, block_size, dtype, enforce_eager):
+        self.model = model
+        fusedsdpa_flag = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0')
+        self.prefill_use_fusedsdpa = fusedsdpa_flag.lower() in ['1', 'true']
+        self.block_size = block_size
+        self.dtype = dtype
+        if not htorch.utils.internal.is_lazy() and not enforce_eager:
+            self.model = torch.compile(self.model,
+                                       backend='hpu_backend',
+                                       dynamic=False)
+
+    def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device,
+                       dtype):
+        """Attach a causal + sequence-length attn_bias to prompt metadata."""
+        if attn_metadata is None or self.prefill_use_fusedsdpa:
+            return attn_metadata
+
+        # True where the key position is at or beyond the sequence length.
+        key_positions = torch.arange(0, seq_len, device=device,
+                                     dtype=torch.int32).view(1, seq_len)
+        seq_lens = attn_metadata.seq_lens_tensor.unsqueeze(-1)
+        len_mask = key_positions.ge(seq_lens).view(batch_size, 1, 1, seq_len)
+        # True strictly above the diagonal, i.e. for later positions.
+        causal_mask = torch.triu(torch.ones((batch_size, 1, seq_len, seq_len),
+                                            device=device,
+                                            dtype=torch.bool),
+                                 diagonal=1)
+        mask = causal_mask.logical_or(len_mask)
+        attn_bias = torch.zeros_like(mask, dtype=dtype).masked_fill_(
+            mask, -math.inf)
+        return attn_metadata._replace(attn_bias=attn_bias)
+
+    def _set_block_mapping(self, metadata, batch_size, device, dtype):
+        """Add the decode attn_bias and one-hot block_mapping to metadata."""
+        slot_ids = torch.arange(0,
+                                self.block_size,
+                                device=device,
+                                dtype=torch.int32).unsqueeze(0)
+        # Slots at or past a block's usage count get a -inf bias.
+        unused = slot_ids >= metadata.block_usage.unsqueeze(-1)
+        attn_bias = torch.zeros_like(unused, dtype=dtype).masked_fill_(
+            unused, -math.inf)
+        one_hot = torch.nn.functional.one_hot(metadata.block_mapping,
+                                              num_classes=batch_size)
+        return metadata._replace(block_mapping=one_hot.to(dtype),
+                                 attn_bias=attn_bias)
+
+    def _update_metadata(self, attn_metadata, batch_size, seq_len, device,
+                         dtype):
+        """Dispatch to the prompt or decode metadata preparation."""
+        if attn_metadata.is_prompt:
+            return self._set_attn_bias(attn_metadata, batch_size, seq_len,
+                                       device, dtype)
+        return self._set_block_mapping(attn_metadata, batch_size, device,
+                                       dtype)
+
+    def forward(self, *args, **kwargs):
+        """Run the model; return hidden states of the selected tokens only."""
+        kwargs = kwargs.copy()
+        # selected_token_indices and lora_mask are consumed by the adapter.
+        selected_token_indices = kwargs.pop('selected_token_indices')
+        kwargs.pop('warmup_mode', None)
+        input_ids = kwargs['input_ids']
+        kwargs['attn_metadata'] = self._update_metadata(
+            kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1),
+            input_ids.device, self.dtype)
+        LoraMask.setLoraMask(kwargs.pop('lora_mask'))
+        hidden_states = self.model(*args, **kwargs)
+        flat_states = hidden_states.view(-1, hidden_states.shape[-1])
+        return flat_states.index_select(0, selected_token_indices)
+
+    def compute_logits(self, *args, **kwargs):
+        """Forward to the wrapped model's compute_logits()."""
+        return self.model.compute_logits(*args, **kwargs)
+
+    def sample(self, *args, **kwargs):
+        """Forward to the wrapped model's sample()."""
+        return self.model.sample(*args, **kwargs)
+
+
+class PreparePromptMetadata(NamedTuple):
+    """Inputs prepared for the prompt part of a batch.
+
+    Note: ``empty()`` stores plain empty lists in every container field,
+    including ``input_tokens`` (annotated as a tensor), and None in
+    ``attn_metadata`` and ``multi_modal_kwargs``.
+    """
+    input_tokens: torch.Tensor
+    input_positions: List[List[int]]
+    attn_metadata: Optional[AttentionMetadata]
+    seq_lens: List[int]
+    query_lens: List[int]
+    lora_index_mapping: List[List[int]]
+    lora_prompt_mapping: List[List[int]]
+    lora_requests: Set[LoRARequest]
+    multi_modal_kwargs: Optional[Dict[str, BatchedTensorInputs]]
+    slot_mapping: List[List[int]]
+    lora_ids: List[int]
+
+    @classmethod
+    def empty(cls):
+        """Return an instance of cls with empty containers and None fields."""
+        return cls(input_tokens=[], input_positions=[], attn_metadata=None,
+                   seq_lens=[], query_lens=[], lora_index_mapping=[],
+                   lora_prompt_mapping=[], lora_requests=set(),
+                   multi_modal_kwargs=None, slot_mapping=[], lora_ids=[])
+
+
+class PrepareDecodeMetadata(NamedTuple):
+    """Inputs prepared for the decode part of a batch.
+
+    ``empty()`` fills every container field with an empty list or set.
+    """
+    input_tokens: torch.Tensor
+    input_positions: List[List[int]]
+    attn_metadata: Optional[AttentionMetadata]
+    lora_index_mapping: List[List[int]]
+    lora_prompt_mapping: List[List[int]]
+    lora_requests: Set[LoRARequest]
+    slot_mapping: List[List[int]]
+    lora_ids: List[int]
+
+    @classmethod
+    def empty(cls):
+        """Return an instance of cls with empty containers, no metadata."""
+        return cls(input_tokens=[], input_positions=[], attn_metadata=None,
+                   lora_index_mapping=[], lora_prompt_mapping=[],
+                   lora_requests=set(), slot_mapping=[], lora_ids=[])
+
+
+class BatchType(IntEnum):
+    """How batches are constructed."""
+    # Every batch is prefill.
+    PREFILL = 0
+    # Every batch is decode.
+    DECODE = 1
+    # Batch is a mixture of prefill and decode.
+    MIXED = 2
+
+
+TModelInputForHPU = TypeVar('TModelInputForHPU', bound="ModelInputForHPU")
+
+
+@dataclasses.dataclass(frozen=True)
+class ModelInputForHPU(ModelRunnerInputBase):
+    """
+    Metadata needed for the base model forward pass.
+
+    Metadata for possible additional steps, e.g., sampling, is not included.
+    Model runners that run additional steps should subclass this class to
+    add additional fields.
+    """
+    input_tokens: Optional[torch.Tensor] = None
+    input_positions: Optional[torch.Tensor] = None
+    seq_lens: Optional[List[int]] = None
+    query_lens: Optional[List[int]] = None
+    lora_mapping: Optional["LoRAMapping"] = None
+    lora_requests: Optional[Set[LoRARequest]] = None
+    attn_metadata: Optional["AttentionMetadata"] = None
+    multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None
+    real_batch_size: Optional[int] = None
+    batch_size_padded: Optional[int] = None
+    virtual_engine: int = 0
+    lora_ids: Optional[List[int]] = None
+    async_callback: Optional[Callable] = None
+
+    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
+        """Build the broadcastable dict from the fields and attn metadata."""
+        field_names = ("input_tokens", "input_positions", "lora_requests",
+                       "lora_mapping", "multi_modal_kwargs",
+                       "real_batch_size", "batch_size_padded",
+                       "virtual_engine", "lora_ids")
+        payload = {name: getattr(self, name) for name in field_names}
+        _add_attn_metadata_broadcastable_dict(payload, self.attn_metadata)
+        return payload
+
+    @classmethod
+    def from_broadcasted_tensor_dict(
+        cls: Type[TModelInputForHPU],
+        tensor_dict: Dict[str, Any],
+        attn_backend: Optional["AttentionBackend"] = None,
+    ) -> TModelInputForHPU:
+        """
+        Build an instance from ``tensor_dict``. When ``attn_backend`` is
+        given, the dict first goes through the attention-metadata init.
+        """
+        if attn_backend is None:
+            return cls(**tensor_dict)
+        return cls(**_init_attn_metadata_from_tensor_dict(
+            attn_backend, tensor_dict))
+
+
+@dataclasses.dataclass(frozen=True)
+class ModelInputForHPUWithSamplingMetadata(ModelInputForHPU):
+    """
+    Model input that additionally carries sampling metadata.
+    Used by the ModelRunner.
+    """
+    sampling_metadata: Optional["SamplingMetadata"] = None
+    # Used for speculative decoding. We do not broadcast it because it is only
+    # used by the driver worker.
+    is_prompt: Optional[bool] = None
+
+    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
+        """Build the broadcastable dict, including sampling metadata."""
+        # NOTE(review): unlike the parent class, real_batch_size,
+        # batch_size_padded and virtual_engine are left out - confirm intent.
+        field_names = ("input_tokens", "input_positions", "lora_requests",
+                       "lora_mapping", "multi_modal_kwargs", "lora_ids")
+        payload = {name: getattr(self, name) for name in field_names}
+        _add_attn_metadata_broadcastable_dict(payload, self.attn_metadata)
+        _add_sampling_metadata_broadcastable_dict(payload,
+                                                  self.sampling_metadata)
+        return payload
+
+    @classmethod
+    def from_broadcasted_tensor_dict(
+        cls,
+        tensor_dict: Dict[str, Any],
+        attn_backend: Optional["AttentionBackend"] = None,
+    ) -> "ModelInputForHPUWithSamplingMetadata":
+        """Build an instance from a received ``tensor_dict``."""
+        restored = _init_sampling_metadata_from_tensor_dict(tensor_dict)
+        # FIXME(kzawora): this fails for whatever reason - why?
+        if attn_backend is not None:
+            restored = _init_attn_metadata_from_tensor_dict(
+                attn_backend, restored)
+        return cls(**restored)
+
+
+class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
+    """
+    Helper class for shared methods between HPU model runners.
+    """
+    _model_input_cls: Type[TModelInputForHPU]
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        is_driver_worker: bool = False,
+        return_hidden_states: bool = False,
+    ):
+        """Cache config-derived settings; set up buckets and gc thresholds."""
+        ModelRunnerBase.__init__(self, vllm_config=vllm_config)
+        self.is_driver_worker = is_driver_worker
+        self.return_hidden_states = return_hidden_states
+
+        self.sliding_window = (self.model_config.get_sliding_window()
+                               if self.model_config is not None else None)
+        self.device_config = (self.device_config if self.device_config
+                              is not None else DeviceConfig())
+        self.device = self.device_config.device
+        self.enforce_eager = self.model_config.enforce_eager
+        self.max_num_seqs = self.scheduler_config.max_num_seqs
+        # NOTE(kzawora): Change that to scheduler_config.max_num_prefill_seqs
+        # once padding-aware scheduling gets merged
+        self.max_num_prefill_seqs = 64
+        self.max_model_len = self.scheduler_config.max_model_len
+        self.max_num_batched_tokens = \
+            self.scheduler_config.max_num_batched_tokens
+        self.block_size = self.cache_config.block_size
+
+        self.pin_memory = is_pin_memory_available()
+        self.kv_cache_dtype = self.cache_config.cache_dtype
+
+        self.attn_backend = get_attn_backend(
+            self.model_config.get_head_size(),
+            self.model_config.dtype,
+            self.kv_cache_dtype,
+            self.block_size,
+            self.model_config.is_attention_free,
+        )
+
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
+
+        self.skip_warmup = os.environ.get('VLLM_SKIP_WARMUP',
+                                          'false').lower() == 'true'
+
+        # Lazy initialization
+        self.lora_manager: LRUCacheWorkerLoRAManager = None
+        self.model: torch.nn.Module = None
+        self.inc_initialized_successfully = False
+
+        # Profiler stats
+        self.profiler = HabanaHighLevelProfiler()
+        self.profiler_counter_helper = HabanaProfilerCounterHelper()
+        self.seen_configs: set = set()
+        self._mem_margin: Optional[int] = None
+        self.bucketing_global_state = HPUBucketingGlobalState()
+        self._setup_buckets()
+        self._set_gc_threshold()
+
+    def _set_gc_threshold(self) -> None:
+        """Set gc thresholds from the VLLM_GC_THR_* environment variables."""
+        # Read https://docs.python.org/3/library/gc.html#gc.set_threshold
+        # for comprehensive description of gc generations.
+        # We can either use VLLM_GC_THR_GEN[0-2] (this has higher priority)
+        # to set particular generation threshold or use simpler
+        # VLLM_GC_THR_MULTIPLIER to multiply default values.
+        default_gc_thrs = list(gc.get_threshold())
+        requested_gc_thrs = [
+            int(os.environ.get(f'VLLM_GC_THR_GEN{i}', default_thr))
+            for i, default_thr in enumerate(default_gc_thrs)
+        ]
+        if requested_gc_thrs == default_gc_thrs:
+            # Nothing differs from the defaults: scale them (x2 if unset).
+            multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', 2))
+            requested_gc_thrs = [t * multiplier for t in default_gc_thrs]
+        gc.set_threshold(*requested_gc_thrs)
+
+    def load_model(self) -> None:
+        """Load the model, then set up LoRA, INC and the HPU graph wrapper."""
+        import habana_frameworks.torch.core as htcore
+        if self.model_config.quantization in ('inc', 'fp8'):
+            htcore.hpu_set_env()
+        with HabanaMemoryProfiler() as m:
+            with HabanaMemoryProfiler() as m_getmodel:
+                self.model = get_model(vllm_config=self.vllm_config)
+            msg = ("Pre-loading model weights on "
+                   f"{next(self.model.parameters()).device} "
+                   f"took {m_getmodel.get_summary_string()}")
+            logger.info(msg)
+
+            if self.lora_config:
+                # LoRA requires the model to expose these attributes.
+                assert (hasattr(self.model, "supported_lora_modules")
+                        and self.model.supported_lora_modules
+                        ), "Model does not support LoRA"
+                assert hasattr(self.model, "embedding_modules"), (
+                    "Model does not have embedding_modules")
+                assert hasattr(self.model, "embedding_padding_modules"), (
+                    "Model does not have embedding_padding_modules")
+                self.lora_manager = LRUCacheWorkerLoRAManager(
+                    self.scheduler_config.max_num_seqs,
+                    self.scheduler_config.max_num_batched_tokens,
+                    self.vocab_size, self.lora_config, self.device,
+                    self.model.embedding_modules,
+                    self.model.embedding_padding_modules)
+                self.model = self.lora_manager.create_lora_manager(self.model)
+
+            if self.model_config.quantization == 'inc':
+                logger.info("Preparing model with INC..")
+                with HabanaMemoryProfiler() as m_inc:
+                    from neural_compressor.torch.quantization import (
+                        FP8Config, convert, prepare)
+                    quant_config_path = os.getenv("QUANT_CONFIG", "")
+                    inc_config = FP8Config.from_json_file(quant_config_path)
+                    if inc_config.measure:
+                        self.model = prepare(self.model, inc_config)
+                    elif inc_config.quantize:
+                        self.model = convert(self.model, inc_config)
+                    htcore.hpu_initialize(self.model,
+                                          mark_only_scales_as_const=True)
+                self.inc_initialized_successfully = True
+                logger.info("Preparing model with INC took %s",
+                            m_inc.get_summary_string())
+            else:
+                self.model = self.model.to("hpu")
+                htcore.mark_step()
+            torch.hpu.synchronize()
+
+            with HabanaMemoryProfiler() as m_wrap:
+                self.model = _maybe_wrap_in_hpu_graph(
+                    self.model,
+                    self.block_size,
+                    dtype=self.model_config.dtype,
+                    enforce_eager=self.enforce_eager)
+            msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}"
+            logger.info(msg)
+
+        self.model_memory_usage = m.consumed_device_memory
+        msg = f"Loading model weights took in total {m.get_summary_string()}"
+        logger.info(msg)
+
+    def _use_graphs(self, batch_size, seq_len, is_prompt):
+        """Return whether graphs are used for (batch_size, seq_len, phase)."""
+        if self.enforce_eager:
+            return False
+        bucket = (batch_size, seq_len, is_prompt)
+        return bool(self.skip_warmup) or bucket in self.graphed_buckets
+
+    def _is_valid_bucket(self, bucket):  # fits max_num_batched_tokens?
+        return bucket[0] * bucket[1] <= self.max_num_batched_tokens
+
+    def _setup_buckets(self) -> None:
+        """Read the prompt/decode bucket configs and log them."""
+        state = self.bucketing_global_state
+        # Batch-size buckets step by 32, capped at max_num_seqs.
+        bs_step = min(self.max_num_seqs, 32)
+        #FIXME: The default values should be max_model_len
+        max_prompt_seq = 1024
+        max_decode_seq = 2048
+        state.prompt_bs_bucket_cfg = read_bucket_settings(
+            'prompt', 'bs', min=1, step=bs_step,
+            max=self.max_num_prefill_seqs)
+        state.decode_bs_bucket_cfg = read_bucket_settings(
+            'decode', 'bs', min=1, step=bs_step, max=self.max_num_seqs)
+        state.prompt_seq_bucket_cfg = read_bucket_settings(
+            'prompt',
+            'seq',
+            min=self.block_size,
+            step=self.block_size,
+            max=max_prompt_seq)
+        max_decode_blocks = max(
+            self.block_size,
+            self.max_num_seqs * max_decode_seq // self.block_size)
+        state.decode_block_bucket_cfg = read_bucket_settings(
+            'decode',
+            'block',
+            min=self.block_size,
+            step=self.block_size,
+            max=max_decode_blocks)
+        self.graphed_buckets: Set[Any] = set()
+
+        msg = ("Prompt bucket config (min, step, max_warmup) "
+               f"bs:{state.prompt_bs_bucket_cfg}, "
+               f"seq:{state.prompt_seq_bucket_cfg}")
+        logger.info(msg)
+
+        msg = ("Decode bucket config (min, step, max_warmup) "
+               f"bs:{state.decode_bs_bucket_cfg}, "
+               f"block:{state.decode_block_bucket_cfg}")
+        logger.info(msg)
+
+    def _prepare_prompt(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> PreparePromptMetadata:  # padded prefill tensors + attn metadata
+        input_tokens: List[List[int]] = []
+        input_positions: List[List[int]] = []
+        slot_mapping: List[List[int]] = []
+        lora_index_mapping: List[List[int]] = []
+        lora_prompt_mapping: List[List[int]] = []
+        lora_requests: Set[LoRARequest] = set()
+
+        seq_lens: List[int] = []
+        context_lens: List[int] = []
+        query_lens: List[int] = []
+        prefix_block_tables: List[List[int]] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
+
+        if len(seq_group_metadata_list) == 0:
+            return PreparePromptMetadata.empty()
+
+        for seq_group_metadata in seq_group_metadata_list:
+            assert seq_group_metadata.is_prompt
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            assert len(seq_ids) == 1
+            seq_id = seq_ids[0]
+
+            computed_block_nums = seq_group_metadata.computed_block_nums
+            if (self.scheduler_config is not None
+                    and self.scheduler_config.chunked_prefill_enabled
+                    and not (computed_block_nums is None
+                             or computed_block_nums == [])):
+                raise RuntimeError(
+                    "chunked prefill cannot be used with prefix caching "
+                    "now.")
+
+            token_chunk_size = seq_group_metadata.token_chunk_size
+            seq_data = seq_group_metadata.seq_data[seq_id]
+            context_len = seq_data.get_num_computed_tokens()
+            # We should use get_len here because in case of preemption
+            # it contains output tokens.
+            seq_len = min(seq_data.get_len(), context_len + token_chunk_size)
+            prompt_tokens = seq_data.get_token_ids()[context_len:seq_len]
+            seq_lens.append(seq_len)
+
+            # NOTE: This only works for oooooooxxx style attention.
+            if computed_block_nums is not None and len(
+                    computed_block_nums) > 0 and self.sliding_window is None:
+                # Prefix is not supported with sliding_window
+                context_len = len(computed_block_nums) * self.block_size
+                prompt_tokens = prompt_tokens[context_len:]
+                prefix_block_tables.append(computed_block_nums)
+            elif self.scheduler_config.chunked_prefill_enabled:
+                if seq_group_metadata.block_tables is not None:
+                    # An earlier chunk of this prompt was prefilled already.
+                    block_table = seq_group_metadata.block_tables[seq_id]
+                    prefix_block_tables.append(block_table)
+                else:
+                    # The first prefill.
+                    prefix_block_tables.append([])
+            else:
+                prefix_block_tables.append([])
+                # Right now, prefill start is always 0. However, this
+                # assumption can be changed once chunked prefill is introduced.
+                assert context_len == 0
+
+            # Actual (unpadded) lengths of this prompt.
+            context_lens.append(context_len)
+            query_lens.append(seq_len - context_len)
+            input_tokens.append(prompt_tokens)
+            # NOTE(woosuk): Here we assume that the first token in the prompt
+            # is always the first token in the sequence.
+            input_positions.append(list(range(context_len, seq_len)))
+
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                multi_modal_inputs_list.append(mm_kwargs)
+
+            if seq_group_metadata.block_tables is None:
+                # During memory profiling, the block tables are not initialized
+                # yet. In this case, we just use a dummy slot mapping.
+                slot_mapping.append([_PAD_SLOT_ID] * seq_len)
+                continue
+
+            # Compute the slot mapping.
+            slot_mapping.append([])
+            block_table = seq_group_metadata.block_tables[seq_id]
+
+            # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
+            # where start_idx is max(0, seq_len - sliding_window).
+            # For example, if the prompt len is 10, sliding window is 8, and
+            # block size is 4, the first two tokens are masked and the slot
+            # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
+            start_idx = 0
+            if self.sliding_window is not None:
+                assert context_len == 0, (
+                    "Prefix caching is currently not supported with "
+                    "sliding window attention")
+                start_idx = max(0, seq_len - self.sliding_window)
+            for i in range(context_len, seq_len):
+                if i < start_idx:
+                    slot_mapping[-1].append(_PAD_SLOT_ID)
+                    continue
+
+                block_number = block_table[i // self.block_size]
+                block_offset = i % self.block_size
+                slot = block_number * self.block_size + block_offset
+                slot_mapping[-1].append(slot)
+
+        max_query_len = max(query_lens)
+        sum_query_len = sum(query_lens)
+        real_num_seqs = len(query_lens)
+        assert max_query_len > 0
+        # Padded length: bucketed max seq_len, at least one block.
+        max_prompt_len = max(
+            find_bucket(max(seq_lens),
+                        self.bucketing_global_state.prompt_seq_bucket_cfg),
+            self.block_size)
+
+        lora_ids: List[int] = []
+        for seq_group_metadata, context_len in zip(seq_group_metadata_list,
+                                                   context_lens):
+            lora_id = seq_group_metadata.lora_int_id
+            lora_ids.append(lora_id)
+
+            if lora_id > 0:
+                lora_requests.add(seq_group_metadata.lora_request)
+
+            lora_index_mapping += [lora_id] * (max_prompt_len - context_len)
+            lora_prompt_mapping.extend(
+                [lora_id] *
+                (max_prompt_len - context_len
+                 if seq_group_metadata.sampling_params.prompt_logprobs else 1))
+        # Pad each per-sequence list to max_prompt_len and build tensors.
+        input_tokens = make_tensor_with_pad(input_tokens,
+                                            max_len=max_prompt_len,
+                                            pad=0,
+                                            dtype=torch.long,
+                                            device=self.device)
+
+        input_positions = make_tensor_with_pad(input_positions,
+                                               max_len=max_prompt_len,
+                                               pad=0,
+                                               dtype=torch.long,
+                                               device=self.device)
+
+        slot_mapping = make_tensor_with_pad(slot_mapping,
+                                            max_len=max_prompt_len,
+                                            pad=_PAD_SLOT_ID,
+                                            dtype=torch.long,
+                                            device=self.device)
+
+        seq_lens_tensor = torch.tensor(seq_lens,
+                                       dtype=torch.long,
+                                       device=self.device)
+
+        block_indices, block_offsets = precompute_indices_and_offsets(
+            self.block_size, slot_mapping, True)
+        attn_metadata = self.attn_backend.make_metadata(
+            is_prompt=True,
+            block_list=None,
+            block_mapping=None,
+            block_usage=None,
+            block_indices=block_indices,
+            block_offsets=block_offsets,
+            block_scales=None,
+            attn_bias=None,
+            seq_lens_tensor=seq_lens_tensor,
+            num_prefills=real_num_seqs,
+            num_prefill_tokens=sum_query_len,
+            num_decode_tokens=0,
+            slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=
+            None  # FIXME(kzawora): multi-modality will not work here
+        )
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
+
+        return PreparePromptMetadata(input_tokens=input_tokens,
+                                     input_positions=input_positions,
+                                     attn_metadata=attn_metadata,
+                                     seq_lens=seq_lens,
+                                     query_lens=query_lens,
+                                     lora_index_mapping=lora_index_mapping,
+                                     lora_prompt_mapping=lora_prompt_mapping,
+                                     lora_requests=lora_requests,
+                                     multi_modal_kwargs=multi_modal_kwargs,
+                                     slot_mapping=slot_mapping,
+                                     lora_ids=lora_ids)
+
+    def _prepare_decode(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> PrepareDecodeMetadata:
+        """Build the single-token decode inputs and attention metadata."""
+        input_tokens: List[List[int]] = []
+        input_positions: List[List[int]] = []
+        slot_mapping: List[List[int]] = []
+        seq_lens: List[int] = []
+        block_tables: List[List[int]] = []
+        lora_index_mapping: List[List[int]] = []
+        lora_prompt_mapping: List[List[int]] = []
+        lora_requests: Set[LoRARequest] = set()
+
+        if len(seq_group_metadata_list) == 0:
+            return PrepareDecodeMetadata.empty()
+        lora_ids: List[int] = []
+
+        dummy_slots = itertools.cycle(
+            range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size))
+
+        for seq_group_metadata in seq_group_metadata_list:
+            assert not seq_group_metadata.is_prompt
+            assert seq_group_metadata.token_chunk_size == 1
+
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            lora_id = seq_group_metadata.lora_int_id
+            lora_ids.append(lora_id)
+
+            if lora_id > 0:
+                lora_requests.add(seq_group_metadata.lora_request)
+
+            for seq_id in seq_ids:
+                seq_data = seq_group_metadata.seq_data[seq_id]
+                generation_token = seq_data.get_last_token_id()
+                input_tokens.append([generation_token])
+
+                seq_len = seq_data.get_len()
+                position = seq_len - 1
+                input_positions.append([position])
+
+                seq_len = seq_len if self.sliding_window is None else min(
+                    seq_len, self.sliding_window)
+                seq_lens.append(seq_len)
+
+                block_table = seq_group_metadata.block_tables[seq_id]
+                if len(block_table) == 0:
+                    block_number = _PAD_BLOCK_ID
+                else:
+                    block_number = block_table[position // self.block_size]
+                if block_number == _PAD_BLOCK_ID:
+                    slot = next(dummy_slots)
+                else:
+                    block_offset = position % self.block_size
+                    slot = block_number * self.block_size + block_offset
+                slot_mapping.append([slot])
+                lora_index_mapping.append(lora_id)
+                lora_prompt_mapping.append(lora_id)
+
+                if self.sliding_window is not None:
+                    sliding_window_blocks = (self.sliding_window //
+                                             self.block_size)
+                    block_table = block_table[-sliding_window_blocks:]
+                block_tables.append(block_table)
+
+        input_tokens = torch.tensor(
+            input_tokens, dtype=torch.long, device=self.device)
+        input_positions = torch.tensor(
+            input_positions, dtype=torch.long, device=self.device)
+
+        # NOTE(review): sums sequence lengths, not a token count - confirm.
+        num_decode_tokens = sum(seq_lens)
+
+        # Flatten block tables; scale each block by 1/len of its own table.
+        block_list = []
+        block_scales = []
+        for bt in block_tables:
+            block_list.extend(bt)
+            blocks_in_group = len(bt)
+            if blocks_in_group > 0:
+                scale = 1.0 / blocks_in_group
+                block_scales.extend([scale] * blocks_in_group)
+
+        # Index by position in the batch so that an empty block table cannot
+        # shift the mapping of the sequences that follow it.
+        block_mapping: List[int] = [
+            i for i, bt in enumerate(block_tables) for _ in bt
+        ]
+
+        last_block = [
+            sl % self.block_size + 1 for sl in itertools.chain(*slot_mapping)
+        ]
+        block_usage = [[self.block_size] * (len(bt) - 1) + [lb]
+                       for bt, lb in zip(block_tables, last_block) if bt]
+        block_usage = list(itertools.chain(*block_usage))
+
+        block_bucket_size = find_bucket(
+            len(block_list),
+            self.bucketing_global_state.decode_block_bucket_cfg)
+        block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
+        block_mapping = pad_list(block_mapping, block_bucket_size, -1)
+        block_usage = pad_list(block_usage, block_bucket_size, 1)
+        block_scales = pad_list(block_scales, block_bucket_size, 0.0)
+
+        block_list = torch.tensor(block_list,
+                                  dtype=torch.int,
+                                  device=self.device)
+        block_mapping = torch.tensor(block_mapping,
+                                     dtype=torch.long,
+                                     device=self.device)
+        block_usage = torch.tensor(block_usage,
+                                   dtype=self.model_config.dtype,
+                                   device=self.device)
+
+        slot_mapping = torch.tensor(slot_mapping,
+                                    dtype=torch.long,
+                                    device=self.device)
+
+        block_indices, block_offsets = precompute_indices_and_offsets(
+            self.block_size, slot_mapping, False)
+        block_scales = torch.tensor(block_scales,
+                                    dtype=self.model_config.dtype,
+                                    device=self.device)
+
+        attn_metadata = self.attn_backend.make_metadata(
+            is_prompt=False,
+            block_list=block_list,
+            block_mapping=block_mapping,
+            block_usage=block_usage,
+            block_indices=block_indices,
+            block_offsets=block_offsets,
+            block_scales=block_scales,
+            attn_bias=None,
+            seq_lens_tensor=None,
+            num_prefills=0,
+            num_prefill_tokens=0,
+            num_decode_tokens=num_decode_tokens,
+            slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=None)
+        return PrepareDecodeMetadata(input_tokens=input_tokens,
+                                     input_positions=input_positions,
+                                     attn_metadata=attn_metadata,
+                                     lora_index_mapping=lora_index_mapping,
+                                     lora_prompt_mapping=lora_prompt_mapping,
+                                     lora_requests=lora_requests,
+                                     slot_mapping=slot_mapping,
+                                     lora_ids=lora_ids)
+
+    def prepare_input_tensors(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> Tuple[TModelInputForHPU, SamplingMetadata]:
+        if len(seq_group_metadata_list) == 0:
+            return self._model_input_cls(), None
+
+        input_tokens = None
+        input_positions = None
+        lora_mapping = None
+        lora_requests = None
+        multi_modal_kwargs = None
+        batch_type = None
+        seq_lens = None
+        query_lens = None
+        real_batch_size = None
+        batch_size_padded = None
+
+        self.event_start = self.profiler.get_timestamp_us()
+        is_prompt = seq_group_metadata_list[0].is_prompt
+        base_event_name = 'prompt' if is_prompt else 'decode'
+        self.profiler.start('internal', base_event_name)
+
+        real_batch_size = len(seq_group_metadata_list)
+        bucket_cfg = self.bucketing_global_state.prompt_bs_bucket_cfg \
+            if is_prompt else self.bucketing_global_state.decode_bs_bucket_cfg
+        batch_size_padded = find_bucket(real_batch_size, bucket_cfg)
+        batch_size_padding = batch_size_padded - real_batch_size
+        seq_group_metadata_list = seq_group_metadata_list.copy()
+        if batch_size_padding > 0:
+            dummy_seq_group_metadata = self.create_dummy_seq_group_metadata(
+                0, 0, is_prompt)
+            seq_group_metadata_list.extend(dummy_seq_group_metadata
+                                           for _ in range(batch_size_padding))
+
+        prefill_reqs = []
+        decode_reqs = []
+        for seq_group_meta in seq_group_metadata_list:
+            if seq_group_meta.is_prompt:
+                prefill_reqs.append(seq_group_meta)
+            else:
+                decode_reqs.append(seq_group_meta)
+
+        # Prepare input tensors.
+        (
+            input_tokens,
+            input_positions,
+            prefill_attn_metadata,
+            seq_lens,
+            query_lens,
+            lora_index_mapping,
+            lora_prompt_mapping,
+            lora_requests,
+            multi_modal_kwargs,
+            slot_mapping,
+            lora_ids,
+        ) = self._prepare_prompt(prefill_reqs)
+        (
+            decode_input_tokens,
+            decode_input_positions,
+            decode_attn_metadata,
+            decode_lora_index_mapping,
+            decode_lora_prompt_mapping,
+            decode_lora_requests,
+            decode_slot_mapping,
+            decode_lora_ids,
+        ) = self._prepare_decode(decode_reqs)
+        sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list,
+                                                     seq_lens, query_lens,
+                                                     self.device,
+                                                     self.pin_memory)
+
+        if not self.scheduler_config.chunked_prefill_enabled:
+            assert (len(prefill_reqs) and len(decode_reqs)) == 0
+
+        num_prefills = len(seq_lens)
+        num_prefill_tokens = len(input_tokens)
+        num_decode_tokens = len(decode_input_tokens)
+
+        # NOTE(kzawora): Here we diverge from GPU code - we don't
+        # support mixed batches, so we either use decode or prefill
+        # inputs, without coalescing.
+        assert (num_prefills == 0 and num_decode_tokens > 0) or (
+            num_prefills > 0
+            and num_decode_tokens == 0), "HPU does not support mixed batches!"
+        if num_decode_tokens > 0:
+            input_tokens = decode_input_tokens
+            input_positions = decode_input_positions
+            slot_mapping = decode_slot_mapping
+            lora_index_mapping = decode_lora_index_mapping
+            lora_prompt_mapping = decode_lora_prompt_mapping
+            lora_requests = decode_lora_requests
+            lora_ids = decode_lora_ids
+
+        # FIXME: We need to adjust selected_token_indices to accommodate
+        # for padding
+        max_len = input_tokens.size(1)
+        paddings = [max_len - s for s in seq_lens]
+        paddings = [0] + paddings[:-1]
+        paddings = list(itertools.accumulate(paddings))
+        paddings_prompt_logprobs = []
+        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+            if seq_group_metadata.sampling_params.prompt_logprobs is not None \
+                              and seq_group_metadata.is_prompt:
+                paddings_prompt_logprobs += ([paddings[i]] * seq_lens[i])
+        paddings = torch.tensor(
+            paddings_prompt_logprobs if paddings_prompt_logprobs else paddings,
+            dtype=sampling_metadata.selected_token_indices.dtype,
+            device=sampling_metadata.selected_token_indices.device)
+        sampling_metadata.selected_token_indices.add_(paddings)
+
+        if self.lora_config:
+            lora_mapping = LoRAMapping(
+                **dict(index_mapping=lora_index_mapping,
+                       prompt_mapping=lora_prompt_mapping,
+                       is_prefill=(num_prefills > 0)))
+        else:
+            lora_mapping = None
+
+        if (prefill_attn_metadata is not None
+                and decode_attn_metadata is not None):
+            batch_type = BatchType.MIXED
+            raise NotImplementedError("Mixed batch is not supported on HPU")
+        elif prefill_attn_metadata is not None:
+            batch_type = BatchType.PREFILL
+        else:
+            batch_type = BatchType.DECODE
+
+        metadata_dict = {
+            "input_tokens": input_tokens,
+            "input_positions": input_positions,
+            "selected_token_indices": sampling_metadata.selected_token_indices,
+            "lora_requests": lora_requests,
+            "lora_mapping": lora_mapping,
+            "multi_modal_kwargs": multi_modal_kwargs,
+            "num_prefill_tokens": num_prefill_tokens,
+            "num_decode_tokens": num_decode_tokens,
+            "slot_mapping": slot_mapping,
+            "num_prefills": num_prefills,
+            "batch_type": batch_type,
+            "seq_lens": seq_lens,
+            "query_lens": query_lens
+        }
+        if prefill_attn_metadata is not None:
+            metadata_dict.update(prefill_attn_metadata.asdict_zerocopy())
+        else:
+            assert decode_attn_metadata is not None
+            metadata_dict.update(decode_attn_metadata.asdict_zerocopy())
+
+        attn_metadata = prefill_attn_metadata if \
+            prefill_attn_metadata is not None else decode_attn_metadata
+
+        return self._model_input_cls(input_tokens=input_tokens,
+                                     seq_lens=seq_lens,
+                                     query_lens=query_lens,
+                                     input_positions=input_positions,
+                                     attn_metadata=attn_metadata,
+                                     lora_requests=lora_requests,
+                                     lora_mapping=lora_mapping,
+                                     multi_modal_kwargs=multi_modal_kwargs,
+                                     real_batch_size=real_batch_size,
+                                     batch_size_padded=batch_size_padded,
+                                     lora_ids=lora_ids), \
+                                        sampling_metadata
+
+    def _seq_len(self, attn_metadata):
+        """Prefill: width of slot_mapping; decode: size of block_list."""
+        if attn_metadata.num_prefills == 0:
+            return attn_metadata.block_list.numel()
+        return attn_metadata.slot_mapping.size(1)
+
+    def trim_attn_metadata(self, metadata: AttentionMetadata) -> object:
+        """Trim ``metadata`` down to the fields listed at the bottom."""
+        # NOTE(kzawora): Trimming metadata is required when using HPUGraphs.
+        # Attention metadata is going to be hashed by PT bridge, and
+        # appropriate HPUGraphs will be matched based on all inputs' hash.
+        #
+        # Before you put more keys in here, make sure you know their
+        # value type and make sure you know how it's going to be hashed.
+        # You can find that information in input_hash function
+        # in habana_frameworks/torch/hpu/graphs.py. You can also hash
+        # it manually with torch.hpu.graphs.input_hash(attention_metadata)
+        #
+        # If you use primitive types here - they will get hashed based
+        # on their value. You *will* get lots of excessive graph captures
+        # (and an OOM eventually) if you decide to put something like
+        # seq_len int here.
+        # If you absolutely need a scalar, put it in a tensor. Tensors
+        # get hashed using their metadata, not their values:
+        # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321))
+        # input_hash(123) != input_hash(321)
+        # input_hash("abc") != input_hash("cba")
+        kept_fields = [
+            'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping',
+            'block_usage', 'slot_mapping', 'is_prompt', 'block_indices',
+            'block_offsets', 'block_scales'
+        ]
+        return subtuple(metadata, 'TrimmedAttentionMetadata', kept_fields)
+
+    def create_dummy_seq_group_metadata(self,
+                                        group_id,
+                                        seq_len,
+                                        is_prompt,
+                                        lora_request=None):
+        """Build a dummy SequenceGroupMetadata holding a single sequence.
+
+        Prompts get ``seq_len`` input tokens and no block table; decodes get
+        ``seq_len - 1`` input tokens, one output token and a block table made
+        of ``_PAD_BLOCK_ID`` entries. ``seq_len`` is clamped to at least 1.
+        """
+        sampling_params = SamplingParams(temperature=0)
+        # The block count is derived from seq_len *before* it is clamped.
+        num_blocks = math.ceil(seq_len / self.block_size)
+        seq_len = max(seq_len, 1)
+        output_len = 0 if is_prompt else 1
+        input_len = seq_len - output_len
+        block_tables = None if is_prompt else {
+            group_id: [_PAD_BLOCK_ID] * num_blocks}
+        seq_data = SequenceData(array('l', [0] * input_len))  # noqa: F821
+        seq_data.output_token_ids = [1] * output_len
+        return SequenceGroupMetadata(request_id=str(group_id),
+                                     is_prompt=(output_len == 0),
+                                     seq_data={group_id: seq_data},
+                                     sampling_params=sampling_params,
+                                     block_tables=block_tables,
+                                     lora_request=lora_request)
+
+    def profile_run(self) -> None:
+        """Warm up one max-size prompt batch; KV caches are ``None``."""
+        num_layers = self.model_config.get_num_layers(self.parallel_config)
+        buckets = self.bucketing_global_state
+        max_batch_size = buckets.prompt_bs_bucket_cfg[-1]
+        # The sequence length is capped so that batch_size * seq_len stays
+        # within max_num_batched_tokens.
+        max_seq_len = min(buckets.prompt_seq_bucket_cfg[-1],
+                          self.max_num_batched_tokens // max_batch_size)
+        self.warmup_scenario(max_batch_size, max_seq_len, True,
+                             [None] * num_layers, False, True)
+
+    def warmup_scenario(self,
+                        batch_size,
+                        seq_len,
+                        is_prompt,
+                        kv_caches,
+                        is_pt_profiler_run=False,
+                        is_lora_profile_run=False) -> None:
+        """Execute one dummy batch of the requested shape.
+
+        For decode scenarios ``seq_len`` is the total number of blocks; it
+        is split across the ``batch_size`` dummy sequences. The batch runs
+        three times when graphs are used or a PyTorch profiler run is
+        requested, once otherwise.
+        """
+        use_graphs = self._use_graphs(batch_size, seq_len, is_prompt)
+        phase_tag = 'prompt' if is_prompt else 'decode'
+        graphs_tag = 'T' if use_graphs else 'F'
+        scenario_name = (f"warmup_{phase_tag}_bs{batch_size}_"
+                         f"seq{seq_len}_graphs{graphs_tag}")
+        max_num_seqs = self.scheduler_config.max_num_seqs
+        # Register max_loras dummy LoRAs (the most distinct adapters, and
+        # therefore the peak memory use) and spread them round-robin over
+        # max_num_seqs sequences.
+        dummy_lora_requests: List[LoRARequest] = []
+        dummy_lora_requests_per_seq: List[LoRARequest] = []
+        if self.lora_config and is_lora_profile_run:
+            assert self.lora_manager is not None
+            with self.lora_manager.dummy_lora_cache():
+                # LoRA ids start at 1.
+                for lora_id in range(1, self.lora_config.max_loras + 1):
+                    dummy_lora_request = LoRARequest(
+                        lora_name=f"warmup_{lora_id}",
+                        lora_int_id=lora_id,
+                        lora_local_path="/not/a/real/path",
+                    )
+                    self.lora_manager.add_dummy_lora(dummy_lora_request,
+                                                     rank=LORA_WARMUP_RANK)
+                    dummy_lora_requests.append(dummy_lora_request)
+                dummy_lora_requests_per_seq = [
+                    dummy_lora_requests[idx % len(dummy_lora_requests)]
+                    for idx in range(max_num_seqs)
+                ]
+        self.profiler.start('internal', scenario_name)
+        times = 3 if use_graphs or is_pt_profiler_run else 1
+        # LoRA enabled but no LoRA profile run: activate an empty adapter set.
+        if self.lora_config and not is_lora_profile_run:
+            lora_mapping = LoRAMapping(
+                index_mapping=[0] * batch_size * seq_len,
+                prompt_mapping=[0] * batch_size * seq_len,
+                is_prefill=is_prompt)
+            self.set_active_loras(set(), lora_mapping)
+        if is_prompt:
+            dummy_lens = [seq_len for _ in range(batch_size)]
+        else:
+            # FIXME: seq_len is actually number of blocks
+            blocks = [seq_len // batch_size for _ in range(batch_size)]
+            blocks[0] += seq_len % batch_size
+            dummy_lens = [b * self.block_size - 1 for b in blocks]
+        # Dummy sequences carry a LoRA request only in LoRA profile runs.
+        seqs = [
+            self.create_dummy_seq_group_metadata(
+                i,
+                dummy_len,
+                is_prompt,
+                lora_request=dummy_lora_requests_per_seq[i]
+                if dummy_lora_requests_per_seq else None)
+            for i, dummy_len in enumerate(dummy_lens)
+        ]
+        torch.hpu.synchronize()
+        # The PyTorch profiler is only set up on the driver worker.
+        profiler = None
+        if is_pt_profiler_run and self.is_driver_worker:
+            profiler = setup_profiler()
+            profiler.start()
+        for _ in range(times):
+            inputs = self.prepare_model_input(seqs)
+            self.execute_model(inputs, kv_caches, warmup_mode=True)
+            torch.hpu.synchronize()
+            if profiler:
+                profiler.step()
+        if profiler:
+            profiler.stop()
+        self.profiler.end()
+        gc.collect()
+    def _require_lora(self):
+        """Return the LoRA manager; raise RuntimeError if LoRA is off."""
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager
+
+    def remove_all_loras(self):
+        """Remove all adapters from the LoRA manager."""
+        self._require_lora().remove_all_adapters()
+
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
+                         lora_mapping: LoRAMapping) -> None:
+        """Activate ``lora_requests`` with the given ``lora_mapping``."""
+        self._require_lora().set_active_adapters(lora_requests, lora_mapping)
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        """Add the adapter described by ``lora_request``."""
+        return self._require_lora().add_adapter(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        """Remove the adapter with id ``lora_id``."""
+        return self._require_lora().remove_adapter(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        """Pin the adapter with id ``lora_id``."""
+        return self._require_lora().pin_adapter(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        """Return the adapter ids known to the LoRA manager."""
+        return self._require_lora().list_adapters()
+
+    def log_warmup(self, phase, i, max_i, batch_size, seq_len):
+        """Log warmup progress, bucket shape and free device memory."""
+        free_mem = format_bytes(
+            HabanaMemoryProfiler.current_free_device_memory())
+        # 'Graph/Prompt' must be labelled like 'Prompt', hence substring.
+        dim = "seq_len" if "Prompt" in phase else "num_blocks"
+        msg = (f"[Warmup][{phase}][{i+1}/{max_i}] "
+               f"batch_size:{batch_size} {dim}:{seq_len} "
+               f"free_mem:{free_mem}")
+        logger.info(msg)
+
+    def warmup_all_buckets(self, buckets, is_prompt, kv_caches):
+        """Log and run a warmup scenario per bucket, last bucket first."""
+        phase = 'Prompt' if is_prompt else 'Decode'
+        for i, (batch_size, seq_len) in enumerate(reversed(buckets)):
+            self.log_warmup(phase, i, len(buckets), batch_size, seq_len)
+            self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches)
+
+    def warmup_graphs(self,
+                      strategy,
+                      buckets,
+                      is_prompt,
+                      kv_caches,
+                      available_mem,
+                      starting_mem=0,
+                      total_batch_seq=0.001):
+        """Warm up ``buckets`` in ``strategy`` order within ``available_mem``.
+
+        Returns ``(total_mem, total_batch_seq, captured_all)``.
+        """
+        num_candidates = len(buckets)
+        phase = f'Graph/{"Prompt" if is_prompt else "Decode"}'
+        ordering : Union[Callable[[Any], Tuple[Any, Any]], \
+            Callable[[Any], Tuple[Any, Any, Any]]]
+        if strategy == 'min_tokens':
+            ordering = lambda b: (b[0] * b[1], b[1], b[0])
+        elif strategy == 'max_bs':
+            ordering = lambda b: (-b[0], b[1])
+        else:
+            raise NotImplementedError(
+                f'Unsupported graph allocation strategy: {strategy}')
+        total_mem, captured_all = starting_mem, True
+        buckets = sorted(buckets, key=ordering)
+        for idx, (batch_size, seq_len) in enumerate(buckets):
+            # Graph memory usage is proportional to seq dimension in a batch
+            batch_seq = batch_size * seq_len if is_prompt else batch_size
+            if batch_seq / total_batch_seq * total_mem >= available_mem:
+                captured_all = False
+                continue
+            graphed_bucket = (batch_size, seq_len, is_prompt)
+            if graphed_bucket in self.graphed_buckets:
+                continue
+            self.graphed_buckets.add(graphed_bucket)
+            self.log_warmup(phase, idx, num_candidates, batch_size, seq_len)
+            with HabanaMemoryProfiler() as mem_prof:
+                self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches)
+            used_mem = align_workers(mem_prof.consumed_device_memory,
+                                     torch.distributed.ReduceOp.MAX)
+            available_mem -= used_mem
+            total_mem += used_mem
+            total_batch_seq += batch_seq
+        return total_mem, total_batch_seq, captured_all
+
+    def log_graph_warmup_summary(self, buckets, is_prompt, total_mem):
+        """Log how many of ``buckets`` were graph-captured for one phase."""
+        # An empty candidate list counts as one to avoid dividing by zero.
+        num_candidates = len(buckets) or 1
+        phase = f'Graph/{"Prompt" if is_prompt else "Decode"}'
+        graphed = [c[:2] for c in self.graphed_buckets if c[2] == is_prompt]
+        captured_pct = 100 * len(graphed) / num_candidates
+        msg = (f'{phase} captured:{len(graphed)} '
+               f'({captured_pct:.1f}%) '
+               f'used_mem:{format_bytes(total_mem)} '
+               f'buckets:{sorted(list(graphed))}')
+        logger.info(msg)
+
+    @torch.inference_mode()
+    def warmup_model(self, kv_caches: List[torch.Tensor]) -> None:
+        """Warm up all prompt/decode buckets and, if applicable, HPU graphs.
+
+        With VLLM_PT_PROFILE set, one scenario is profiled and the run aborts.
+        """
+        if profile := os.environ.get('VLLM_PT_PROFILE', None):
+            phase, bs, seq_len, graph = profile.split('_')
+            is_prompt = phase == 'prompt'
+            graphs = graph == 't'
+            if graphs:
+                self.graphed_buckets.add((int(bs), int(seq_len), is_prompt))
+            self.warmup_scenario(int(bs), int(seq_len), is_prompt, kv_caches,
+                                 True)
+            raise AssertionError("Finished profiling")
+        if self.skip_warmup:
+            logger.info("Skipping warmup...")
+            return
+        self.profiler.start('internal', 'warmup')
+        max_blocks = kv_caches[0][0].size(0)
+
+        self.bucketing_global_state.prompt_buckets, prompt_omitted_buckets = \
+            generate_prompt_buckets(
+            self.bucketing_global_state.prompt_bs_bucket_cfg,
+            self.bucketing_global_state.prompt_seq_bucket_cfg,
+            self.max_num_batched_tokens)
+
+        logger.info("Generated %d prompt buckets [bs, seq]: %s",
+                    len(self.bucketing_global_state.prompt_buckets),
+                    list(sorted(self.bucketing_global_state.prompt_buckets)))
+
+        msg = (f"Omitted {len(prompt_omitted_buckets)} "
+               "prompt buckets due to exceeded token budget "
+               f"(max_num_batched_tokens={self.max_num_batched_tokens})")
+        logger.info(msg)
+
+        msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}"
+        logger.debug(msg)
+
+        self.bucketing_global_state.decode_buckets = generate_decode_buckets(
+            self.bucketing_global_state.decode_bs_bucket_cfg,
+            self.bucketing_global_state.decode_block_bucket_cfg, max_blocks)
+        logger.info("Generated %d decode buckets [bs, total_blocks]: %s",
+                    len(self.bucketing_global_state.decode_buckets),
+                    list(sorted(self.bucketing_global_state.decode_buckets)))
+
+        if not htorch.utils.internal.is_lazy() and not self.enforce_eager:
+            cache_size_limit = len(
+                self.bucketing_global_state.prompt_buckets) + len(
+                    self.bucketing_global_state.decode_buckets) + 1
+            torch._dynamo.config.cache_size_limit = max(
+                cache_size_limit, torch._dynamo.config.cache_size_limit)
+            # Multiply by 8 to follow the original default ratio between
+            # the cache_size_limit and accumulated_cache_size_limit
+            torch._dynamo.config.accumulated_cache_size_limit = max(
+                cache_size_limit * 8,
+                torch._dynamo.config.accumulated_cache_size_limit)
+
+        start_mem = HabanaMemoryProfiler.current_device_memory_usage()
+        start_time = time.perf_counter()
+
+        compile_only_mode_context = functools.partial(
+            bc.env_setting, "PT_COMPILE_ONLY_MODE", True)
+        can_use_compile_only_mode = True
+        try:
+            with compile_only_mode_context():
+                pass
+            logger.debug("Using PT_COMPILE_ONLY_MODE.")
+        except KeyError:
+            can_use_compile_only_mode = False
+            logger.warning('Cannot use PT_COMPILE_ONLY_MODE. '
+                           'Warmup time will be negatively impacted. '
+                           'Please update Gaudi Software Suite.')
+        with compile_only_mode_context(
+        ) if can_use_compile_only_mode else contextlib.nullcontext():
+            self.warmup_all_buckets(self.bucketing_global_state.prompt_buckets,
+                                    True, kv_caches)
+            self.warmup_all_buckets(self.bucketing_global_state.decode_buckets,
+                                    False, kv_caches)
+
+            if not self.enforce_eager and htorch.utils.internal.is_lazy():
+                assert self.mem_margin is not None, \
+                    ("HabanaWorker.determine_num_available_blocks needs "
+                    "to be called before warming up the model.")
+                free_mem = HabanaMemoryProfiler.current_free_device_memory()
+                graph_free_mem = free_mem - self.mem_margin
+                graph_free_mem = align_workers(graph_free_mem,
+                                               torch.distributed.ReduceOp.MIN)
+                prompt_graph_mem_ratio = float(
+                    os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.3'))
+                prompt_available_memory = (prompt_graph_mem_ratio *
+                                           graph_free_mem)
+                decode_available_memory = (graph_free_mem -
+                                           prompt_available_memory)
+                msg = (
+                    f"Using {format_bytes(graph_free_mem)}"
+                    f"/{format_bytes(free_mem)} "
+                    "of free device memory for HPUGraphs, "
+                    f"{format_bytes(prompt_available_memory)} for prompt and "
+                    f"{format_bytes(decode_available_memory)} for decode "
+                    f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})")
+                logger.info(msg)
+                prompt_strategy = os.environ.get('VLLM_GRAPH_PROMPT_STRATEGY',
+                                                 'min_tokens')
+                decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY',
+                                                 'max_bs')
+                mem_post_prompt, prompt_batch_seq, prompt_captured_all = \
+                    self.warmup_graphs(
+                    prompt_strategy, self.bucketing_global_state.prompt_buckets,
+                    True, kv_caches, prompt_available_memory)
+                mem_post_decode, decode_batch_seq, decode_captured_all = \
+                    self.warmup_graphs(
+                    decode_strategy, self.bucketing_global_state.decode_buckets,
+                    False, kv_caches, decode_available_memory)
+
+                # Not all prompt buckets were captured, but all decode buckets
+                # were captured and we have some free graph-allocated space
+                # left. Let's try to use it for capturing more prompt buckets.
+                if (mem_post_decode + mem_post_prompt < graph_free_mem
+                        and not prompt_captured_all and decode_captured_all):
+                    mem_post_prompt, _, prompt_captured_all = (
+                        self.warmup_graphs(
+                            prompt_strategy,
+                            self.bucketing_global_state.prompt_buckets, True,
+                            kv_caches,
+                            graph_free_mem - mem_post_prompt - mem_post_decode,
+                            mem_post_prompt, prompt_batch_seq))
+
+                # Not all decode buckets were captured, but all prompt buckets
+                # were captured and we have some free graph-allocated space
+                # left. Let's try to use it for capturing more decode buckets.
+                if (mem_post_decode + mem_post_prompt < graph_free_mem
+                        and not decode_captured_all and prompt_captured_all):
+                    mem_post_decode, _, _ = self.warmup_graphs(
+                        decode_strategy,
+                        self.bucketing_global_state.decode_buckets, False,
+                        kv_caches,
+                        graph_free_mem - mem_post_prompt - mem_post_decode,
+                        mem_post_decode, decode_batch_seq)
+
+                self.log_graph_warmup_summary(
+                    self.bucketing_global_state.prompt_buckets, True,
+                    mem_post_prompt)
+                self.log_graph_warmup_summary(
+                    self.bucketing_global_state.decode_buckets, False,
+                    mem_post_decode)
+
+        elapsed_time = time.perf_counter() - start_time
+        end_mem = HabanaMemoryProfiler.current_device_memory_usage()
+        msg = (
+            f"Warmup finished in {elapsed_time:.0f} secs, "
+            f"allocated {format_bytes(end_mem - start_mem)} of device memory")
+        logger.info(msg)
+        self.profiler.end()
+
+    @property
+    def vocab_size(self) -> int:  # vocabulary size reported by model_config
+        return self.model_config.get_vocab_size()
+
+    @property
+    def mem_margin(self) -> Optional[int]:  # must be set before graph warmup
+        return self._mem_margin
+
+    @mem_margin.setter
+    def mem_margin(self, value):  # plain assignment, no validation
+        self._mem_margin = value
+
+
+def _maybe_wrap_in_hpu_graph(*args, **kwargs):  # graph-wrap in lazy mode only
+    return htorch.hpu.wrap_in_hpu_graph(
+        HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True
+    ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs)
+
+
+class HabanaProfilerCounterHelper():
+    """Collect per-iteration throughput and KV-cache usage counters.
+
+    Sequence lengths are captured per batch; ``niter`` and the running
+    average of the effective throughput persist across calls.
+    """
+
+    def __init__(self):
+        self.niter = 0
+        self.average_real_throughput = None
+        self.logged_once = False
+        self.real_seq_lens = []
+        self.prompt_seq_lens = []
+
+    def capture_seq_group_metadata_stats(self, seq_group_metadata_list):
+        """Record total and prompt-only token counts of every sequence."""
+        all_seq_data = [
+            seq_data for seq_group_metadata in seq_group_metadata_list
+            for seq_data in seq_group_metadata.seq_data.values()
+        ]
+        self.real_seq_lens = [
+            len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids)
+            for seq_data in all_seq_data
+        ]
+        self.prompt_seq_lens = [
+            len(seq_data.prompt_token_ids) for seq_data in all_seq_data
+        ]
+
+    def get_counter_dict(self, cache_config, duration, seq_len,
+                         batch_size_padded, real_batch_size, is_prompt):
+        """Return the counters of one iteration and advance ``niter``.
+
+        ``duration`` is divided by 1e6 to get the throughput time base.
+        """
+        elapsed = duration / 1e6
+        throughput = batch_size_padded / elapsed
+        throughput_effective = real_batch_size / elapsed
+        real_max_seq_len = max(self.real_seq_lens)
+        padded_num_tokens = batch_size_padded * seq_len
+        batch_token_utilization = sum(self.real_seq_lens) / padded_num_tokens
+        # Running mean of the effective throughput over all iterations.
+        if self.average_real_throughput is None:
+            self.average_real_throughput = throughput_effective
+        else:  # https://www.heikohoffmann.de/htmlthesis/node134.html
+            self.average_real_throughput += 1 / (self.niter + 1) * (
+                throughput_effective - self.average_real_throughput)
+        phase = "prompt" if is_prompt else "decode"
+        counters = {
+            f'{phase}_bucket_batch_size': batch_size_padded,
+            f'{phase}_batch_size': real_batch_size,
+            f'{phase}_bucket_seq_len': seq_len,
+            f'{phase}_seq_len': real_max_seq_len,
+            f'{phase}_bucket_gen_throughput': throughput,
+            f'{phase}_real_gen_throughput': throughput_effective,
+            f'{phase}_batch_token_utilization': batch_token_utilization,
+            'average_real_throughput': self.average_real_throughput,
+            'engine_iteration': self.niter,
+        }
+        self.niter += 1
+        if is_prompt:
+            # Input-token throughput is only reported for prompt batches.
+            counters[f'{phase}_bucket_in_throughput'] = (
+                padded_num_tokens / elapsed)
+            counters[f'{phase}_real_in_throughput'] = (
+                sum(self.prompt_seq_lens) / elapsed)
+
+        # KV cache might not be created yet (e.g. for profiling run)
+        num_cache_blocks = cache_config.num_gpu_blocks
+        if num_cache_blocks is not None and num_cache_blocks != 0:
+            block_size = cache_config.block_size
+            blocks_used = sum(
+                math.ceil(sl / block_size) for sl in self.real_seq_lens)
+            max_blocks_per_seq = math.ceil(seq_len / block_size)
+            counters['cache_num_blocks_used'] = blocks_used
+            counters['cache_num_free_blocks'] = num_cache_blocks - blocks_used
+            counters['cache_computed_utilization'] = (
+                blocks_used / num_cache_blocks)
+            counters[f'{phase}_batch_block_utilization'] = blocks_used / (
+                batch_size_padded * max_blocks_per_seq)
+        # Constant configuration values are emitted on the first call only.
+        if not self.logged_once:
+            counters['const_cache_num_blocks'] = cache_config.num_gpu_blocks
+            counters['const_gpu_memory_utilization'] = (
+                cache_config.gpu_memory_utilization)
+            counters['const_block_size'] = cache_config.block_size
+            self.logged_once = True
+        return counters
+
+
+def unwrap_model(model):
+    """Return the child modules of the first submodule of ``model``."""
+    # Wrappers of type OptimizedModule are peeled off first.
+    while isinstance(model, torch._dynamo.eval_frame.OptimizedModule):
+        model = model._orig_mod
+    first_child = list(vars(model)['_modules'].values())[0]
+    return list(vars(first_child)['_modules'].values())
+
+
+class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]):
+    """
+    HPU model runner with sampling step.
+    """
+    _model_input_cls: Type[ModelInputForHPUWithSamplingMetadata] = (
+        ModelInputForHPUWithSamplingMetadata)
+
+    def make_model_input_from_broadcasted_tensor_dict(
+        self,
+        tensor_dict: Dict[str, Any],
+    ) -> ModelInputForHPUWithSamplingMetadata:
+        """Rebuild the model input from a broadcasted tensor dict."""
+        model_input_cls = ModelInputForHPUWithSamplingMetadata
+        return model_input_cls.from_broadcasted_tensor_dict(
+            tensor_dict,
+            attn_backend=self.attn_backend)
+
+    @torch.inference_mode()
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
+    ) -> ModelInputForHPUWithSamplingMetadata:
+        """Build the model input, including sampling metadata, for a batch.
+
+        Input tensors and sampling metadata come from
+        ``prepare_input_tensors``. The returned object additionally carries
+        ``is_prompt`` (read from the attention metadata) and
+        ``virtual_engine``. ``finished_requests_ids`` is not used here.
+        When the profiler is enabled, sequence statistics are captured
+        before the tensors are prepared.
+        """
+        with self.profiler.record_event('internal', 'prepare_input_tensors'):
+            assert seq_group_metadata_list is not None
+            if self.profiler.enabled:
+                self.profiler_counter_helper.capture_seq_group_metadata_stats(
+                    seq_group_metadata_list=seq_group_metadata_list)
+            model_input, sampling_metadata = self.prepare_input_tensors(
+                seq_group_metadata_list)
+            assert model_input.attn_metadata is not None
+            is_prompt = model_input.attn_metadata.is_prompt
+
+        return dataclasses.replace(model_input,
+                                   sampling_metadata=sampling_metadata,
+                                   is_prompt=is_prompt,
+                                   virtual_engine=virtual_engine)
+
+    def finish_measurements(self):
+        from neural_compressor.torch.quantization import finalize_calibration
+        finalize_calibration(self.model.model)
+
+    def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode):
+        cfg = (batch_size, seq_len, is_prompt)
+        seen = cfg in self.seen_configs
+        self.seen_configs.add(cfg)
+        if not seen and not warmup_mode:
+            phase = 'prompt' if is_prompt else 'decode'
+            logger.warning("Configuration: (%s, %s, %s) was not warmed-up!",
+                           phase, batch_size, seq_len)
+
+    def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int],
+                         is_prompt: bool):
+        '''
+        This is a helper function to create the mask for lora computations.
+        Lora Mask is needed to ensure we match the correct lora weights for the
+        request.
+        For Prompt phase we have
+        lora_mask with shape (batch_size * seq_len, max_loras * max_rank)
+        lora_logits_mask with shape (batch_size, max_loras * max_rank)
+        For Decode phase we have both
+        lora_mask and lora_logits_mask with shape
+        (batch_size, max_loras * max_rank)
+        '''
+        lora_mask: torch.Tensor = None
+        lora_logits_mask: torch.Tensor = None
+        lora_index = 0
+
+        if self.lora_config:
+            if is_prompt:
+                lora_mask = torch.zeros(
+                    input_tokens.shape[0] * input_tokens.shape[1],
+                    (self.lora_config.max_loras) *\
+                        self.lora_config.max_lora_rank,
+                    dtype=self.lora_config.lora_dtype)
+                lora_logits_mask = torch.zeros(
+                    input_tokens.shape[0], (self.lora_config.max_loras) *
+                    self.lora_config.max_lora_rank,
+                    dtype=self.lora_config.lora_dtype)
+
+                ones = torch.ones(input_tokens.shape[1],
+                                  self.lora_config.max_lora_rank,
+                                  dtype=self.lora_config.lora_dtype)
+                logit_ones = torch.ones(1,
+                                        self.lora_config.max_lora_rank,
+                                        dtype=self.lora_config.lora_dtype)
+
+                for i in range(len(lora_ids)):
+                    if lora_ids[i] == 0:
+                        continue
+                    lora_index = self.lora_manager._adapter_manager.\
+                        lora_index_to_id.index(lora_ids[i])
+                    start_row = i * input_tokens.shape[1]
+                    end_row = start_row + input_tokens.shape[1]
+                    start_col = lora_index * self.lora_config.max_lora_rank
+                    end_col = start_col + self.lora_config.max_lora_rank
+                    lora_mask[start_row:end_row, start_col:end_col] = ones
+                    lora_logits_mask[i, start_col:end_col] = logit_ones
+                lora_mask = lora_mask.to('hpu')
+                lora_logits_mask = lora_logits_mask.to('hpu')
+            else:
+                lora_mask = torch.zeros(input_tokens.shape[0],
+                                        (self.lora_config.max_loras) *
+                                        self.lora_config.max_lora_rank,
+                                        dtype=self.lora_config.lora_dtype)
+                ones = torch.ones(1,
+                                  self.lora_config.max_lora_rank,
+                                  dtype=self.lora_config.lora_dtype)
+                for i in range(len(lora_ids)):
+                    if lora_ids[i] == 0:
+                        continue
+                    lora_index = self.lora_manager._adapter_manager.\
+                        lora_index_to_id.index(lora_ids[i])
+                    start_pos = lora_index * self.lora_config.max_lora_rank
+                    end_pos = start_pos + self.lora_config.max_lora_rank
+                    lora_mask[i, start_pos:end_pos] = ones
+                lora_mask = lora_mask.to('hpu')
+                lora_logits_mask = lora_mask
+
+        return lora_mask, lora_logits_mask
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        model_input: ModelInputForHPUWithSamplingMetadata,
+        kv_caches: List[torch.Tensor],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        num_steps: int = 1,
+        warmup_mode=False,
+    ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]:
+        if num_steps > 1:
+            raise ValueError(
+                "num_steps > 1 is not supported in HPUModelRunner")
+
+        if self.lora_config:
+            assert model_input.lora_requests is not None
+            assert model_input.lora_mapping is not None
+            self.set_active_loras(model_input.lora_requests,
+                                  model_input.lora_mapping)
+        input_tokens = model_input.input_tokens
+        input_positions = model_input.input_positions
+        attn_metadata = model_input.attn_metadata
+        sampling_metadata = model_input.sampling_metadata
+        real_batch_size = model_input.real_batch_size
+        batch_size_padded = model_input.batch_size_padded
+        assert input_tokens is not None
+        assert input_positions is not None
+        assert sampling_metadata is not None
+        assert attn_metadata is not None
+        is_prompt = attn_metadata.is_prompt
+        assert is_prompt is not None
+        batch_size = input_tokens.size(0)
+        seq_len = self._seq_len(attn_metadata)
+        use_graphs = self._use_graphs(batch_size, seq_len, is_prompt)
+        self._check_config(batch_size, seq_len, is_prompt, warmup_mode)
+
+        lora_mask: torch.Tensor = None
+        lora_logits_mask: torch.Tensor = None
+        if self.lora_config:
+            assert model_input.lora_ids is not None
+            lora_mask, lora_logits_mask = self.create_lora_mask(
+                input_tokens, model_input.lora_ids, attn_metadata.is_prompt)
+
+        execute_model_kwargs = {
+            "input_ids": input_tokens,
+            "positions": input_positions,
+            "kv_caches": kv_caches,
+            "attn_metadata": self.trim_attn_metadata(attn_metadata),
+            "intermediate_tensors": intermediate_tensors,
+            "lora_mask": lora_mask,
+            **(model_input.multi_modal_kwargs or {}),
+        }
+        if htorch.utils.internal.is_lazy():
+            execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs})
+
+        htorch.core.mark_step()
+        if self.is_driver_worker:
+            model_event_name = ("model_"
+                                f"{'prompt' if is_prompt else 'decode'}_"
+                                f"bs{batch_size}_"
+                                f"seq{seq_len}_"
+                                f"graphs{'T' if use_graphs else 'F'}")
+        else:
+            model_event_name = 'model_executable'
+        with self.profiler.record_event('internal', model_event_name):
+            hidden_states = self.model.forward(
+                **execute_model_kwargs,
+                selected_token_indices=sampling_metadata.selected_token_indices
+            )
+
+        if self.lora_config:
+            LoraMask.setLoraMask(
+                lora_logits_mask.index_select(
+                    0, sampling_metadata.selected_token_indices))
+
+        # Compute the logits.
+        with self.profiler.record_event(
+                'internal', ('compute_logits_'
+                             f'{"prompt" if is_prompt else "decode"}_bs'
+                             f'{batch_size}_'
+                             f'seq{seq_len}')):
+            sampling_metadata.selected_token_indices = None
+            logits = self.model.compute_logits(hidden_states,
+                                               sampling_metadata)
+        htorch.core.mark_step()
+        # Only perform sampling in the driver worker.
+        if not self.is_driver_worker:
+            return []
+
+        if model_input.async_callback is not None:
+            model_input.async_callback()
+
+        # Sample the next token.
+        with self.profiler.record_event(
+                'internal', ('sample_'
+                             f'{"prompt" if is_prompt else "decode"}_'
+                             f'bs{batch_size}_'
+                             f'seq{seq_len}')):
+            output = self.model.sample(
+                logits=logits,
+                sampling_metadata=sampling_metadata,
+            )
+        output.outputs = output.outputs[:real_batch_size]
+        htorch.core.mark_step()
+
+        if self.is_driver_worker and self.profiler.enabled:
+            # Stop recording 'execute_model' event
+            self.profiler.end()
+            event_end = self.profiler.get_timestamp_us()
+            counters = self.profiler_counter_helper.get_counter_dict(
+                cache_config=self.cache_config,
+                duration=event_end - self.event_start,
+                seq_len=seq_len,
+                batch_size_padded=batch_size_padded,
+                real_batch_size=real_batch_size,
+                is_prompt=is_prompt)
+            self.profiler.record_counter(self.event_start, counters)
+        return [output]
+
+    def shutdown_inc(self):
+        can_finalize_inc = False
+        from contextlib import suppress
+        with suppress(AttributeError):
+            can_finalize_inc = (self.model_config.quantization == 'inc') and \
+                (self.model.model is not None) and \
+                self.inc_initialized_successfully and \
+                not getattr(self, "_is_inc_finalized", False)
+        if can_finalize_inc:
+            from neural_compressor.torch.quantization import (
+                finalize_calibration)
+            finalize_calibration(self.model.model)
+            self._is_inc_finalized = True
+
+    def __del__(self):
+        self.shutdown_inc()
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
new file mode 100644
index 0000000000000..493f7a9fad098
--- /dev/null
+++ b/vllm/worker/hpu_worker.py
@@ -0,0 +1,410 @@
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+import gc
+import os
+from typing import List, Optional, Set, Tuple, Type
+
+import habana_frameworks.torch as htorch  # noqa:F401
+import torch
+import torch.distributed
+from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes
+
+import vllm.envs as envs
+from vllm.config import ParallelConfig, VllmConfig
+from vllm.distributed import (ensure_model_parallel_initialized,
+                              init_distributed_environment)
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.model_executor import set_random_seed
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.sequence import ExecuteModelRequest
+from vllm.worker.cache_engine import CacheEngine
+from vllm.worker.hpu_model_runner import HPUModelRunner
+from vllm.worker.model_runner_base import ModelRunnerBase
+from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
+                                     WorkerInput)
+
+logger = init_logger(__name__)
+
+
+class HPUWorker(LocalOrDistributedWorkerBase):
+    """A worker class that executes (a partition of) the model on a HPU.
+
+    Each worker is associated with a single HPU. The worker is responsible for
+    maintaining the KV cache and executing the model on the HPU. In case of
+    distributed inference, each worker is assigned a partition of the model.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+        is_driver_worker: bool = False,
+        model_runner_cls: Optional[Type[ModelRunnerBase]] = None,
+    ) -> None:
+        WorkerBase.__init__(self, vllm_config=vllm_config)
+        self.parallel_config.rank = rank
+        self.local_rank = local_rank
+        self.rank = rank
+        self.distributed_init_method = distributed_init_method
+        self.is_driver_worker = is_driver_worker
+        if self.is_driver_worker:
+            assert self.rank == 0, "The driver worker must have rank 0."
+
+        if self.model_config.trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils import init_cached_hf_modules
+            init_cached_hf_modules()
+
+        self.model_runner: HPUModelRunner = HPUModelRunner(
+            vllm_config=vllm_config, is_driver_worker=is_driver_worker)
+        # Uninitialized cache engine. Will be initialized by
+        # initialize_cache.
+        self.cache_engine: List[HPUCacheEngine]
+        # Initialize hpu_cache as embedding models don't initialize kv_caches
+        self.hpu_cache: Optional[List[List[torch.tensor]]] = None
+        # Torch profiler. Enabled and configured through env vars:
+        # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
+        if envs.VLLM_TORCH_PROFILER_DIR:
+            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+            logger.info("Profiling enabled. Traces will be saved to: %s",
+                        torch_profiler_trace_dir)
+            self.profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.HPU,
+                ],
+                with_stack=True,
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    torch_profiler_trace_dir, use_gzip=True))
+        else:
+            self.profiler = None
+
+    def start_profile(self):
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        self.profiler.start()
+
+    def stop_profile(self):
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        self.profiler.stop()
+
+    def _set_env_vars(self):
+        local_rank = self.local_rank
+        if self.parallel_config.world_size == 1:
+            local_rank = -1
+        import os
+        os.environ["LOCAL_RANK"] = str(local_rank)
+        os.environ["ID"] = str(local_rank)
+        os.environ["WORLD_SIZE"] = str(self.parallel_config.world_size)
+        os.environ["RANK"] = str(self.rank)
+
+    def init_device(self) -> None:
+        if self.device_config.device.type == "hpu":
+            self.device = torch.device("hpu")
+            torch.hpu.set_device(self.device)
+        else:
+            raise RuntimeError(
+                f"Not support device type: {self.device_config.device}")
+        # Initialize the distributed environment.
+        if self.model_config.quantization == 'inc':
+            self._set_env_vars()
+        init_worker_distributed_environment(self.parallel_config, self.rank,
+                                            self.distributed_init_method,
+                                            self.local_rank)
+        # Set random seed.
+        set_random_seed(self.model_config.seed)
+
+    def load_model(self):
+        self.model_runner.load_model()
+
+    @torch.inference_mode()
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model to determine how many
+        KV blocks may be allocated without OOMs.
+
+        The engine will first conduct a profiling of the existing memory usage.
+        Then, it calculates the maximum possible number of HPU and CPU blocks
+        that can be allocated with the remaining free memory.
+
+        .. tip::
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """
+        # Profile the memory usage of the model and get the maximum number of
+        # cache blocks that can be allocated with the remaining free memory.
+
+        # Execute a forward pass with dummy inputs to profile the memory usage
+        # of the model.
+        with HabanaMemoryProfiler() as m:
+            self.model_runner.profile_run()
+            torch.hpu.synchronize()
+        msg = ("Model profiling run "
+               f"took {m.get_summary_string()}")
+        logger.info(msg)
+        # At this point we should've allocated the maximum workspace for all
+        # recipes; we will use the extra memory for graphs/blocks.
+        free_hpu_memory = torch.hpu.mem_get_info()[0]
+
+        cache_block_size = self.get_cache_block_size_bytes()
+        graph_reserved_mem = (float(
+            os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.1'))
+                              if not self.model_config.enforce_eager else 0)
+        graph_headroom = 1 - graph_reserved_mem
+        available_hpu_memory = free_hpu_memory * \
+            self.cache_config.gpu_memory_utilization
+        hpu_memory_margin = free_hpu_memory * (
+            1 - self.cache_config.gpu_memory_utilization)
+        self.model_runner.mem_margin = hpu_memory_margin
+        cache_size_bytes = available_hpu_memory * graph_headroom
+        graph_headroom_bytes = available_hpu_memory * (1 - graph_headroom)
+        msg = (
+            f"Free device memory: {format_bytes(free_hpu_memory)}, "
+            f"{format_bytes(available_hpu_memory)} usable "
+            f"(gpu_memory_utilization={self.cache_config.gpu_memory_utilization}),"
+            f" {format_bytes(graph_headroom_bytes)} reserved for HPUGraphs "
+            f"(VLLM_GRAPH_RESERVED_MEM={graph_reserved_mem}), "
+            f"{format_bytes(cache_size_bytes)} reserved for KV cache")
+        logger.info(msg)
+        num_hpu_blocks = int(cache_size_bytes // cache_block_size)
+        num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+                             cache_block_size)
+        num_hpu_blocks = max(num_hpu_blocks, 0)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
+
+        if self.model_runner.lora_manager:
+            self.model_runner.remove_all_loras()
+
+        gc.collect()
+        return num_hpu_blocks, num_cpu_blocks
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Allocate HPU and CPU KV cache with the specified number of blocks.
+
+        This also warms up the model, which may record HPU graphs.
+        """
+        raise_if_cache_size_invalid(num_gpu_blocks,
+                                    self.cache_config.block_size,
+                                    self.model_config.max_model_len)
+
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+        with HabanaMemoryProfiler() as m:
+            self._init_cache_engine()
+            torch.hpu.synchronize()
+        msg = ("Initializing cache engine "
+               f"took {m.get_summary_string()}")
+        logger.info(msg)
+        self._warm_up_model()
+
+    def _init_cache_engine(self):
+        assert self.cache_config.num_gpu_blocks is not None
+        self.cache_engine = [
+            HPUCacheEngine(self.cache_config, self.model_config,
+                           self.parallel_config, self.device_config)
+            for _ in range(self.parallel_config.pipeline_parallel_size)
+        ]
+        self.hpu_cache = [
+            self.cache_engine[ve].gpu_cache
+            for ve in range(self.parallel_config.pipeline_parallel_size)
+        ]
+
+    def _warm_up_model(self) -> None:
+        # NOTE(kzawora): We should use virtual engine index here
+        # for pipeline parallelism. Using 0 for now.
+        assert self.hpu_cache is not None
+        self.model_runner.warmup_model(self.hpu_cache[0])
+        # Reset the seed to ensure that the random state is not affected by
+        # the model initialization and profiling.
+        set_random_seed(self.model_config.seed)
+
+    def finish_measurements(self):
+        self.model_runner.finish_measurements()
+
+    @property
+    def do_metadata_broadcast(self) -> bool:
+        return self.parallel_config.tensor_parallel_size > 1
+
+    @property
+    def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
+        return self.hpu_cache
+
+    @torch.inference_mode()
+    def prepare_worker_input(
+            self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
+        virtual_engine = execute_model_req.virtual_engine
+        num_seq_groups = len(execute_model_req.seq_group_metadata_list)
+        # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors.
+        # they contain parameters to launch cudamemcpyasync.
+        blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in,
+                                         device="cpu",
+                                         dtype=torch.int64).view(-1, 2)
+        blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out,
+                                          device="cpu",
+                                          dtype=torch.int64).view(-1, 2)
+        # `blocks_to_copy` is a gpu tensor. The src and tgt of
+        # blocks to copy are in the same device, and `blocks_to_copy`
+        # can be used directly within cuda kernels.
+        blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
+                                      device=self.device,
+                                      dtype=torch.int64).view(-1, 2)
+
+        return WorkerInput(
+            num_seq_groups=num_seq_groups,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy,
+            virtual_engine=virtual_engine,
+        )
+
+    @torch.inference_mode()
+    def execute_worker(self, worker_input: WorkerInput) -> None:
+        virtual_engine = worker_input.virtual_engine
+        # Issue cache operations.
+        if (worker_input.blocks_to_swap_in is not None
+                and worker_input.blocks_to_swap_in.numel() > 0):
+            self.cache_engine[virtual_engine].swap_in(
+                worker_input.blocks_to_swap_in)
+        if (worker_input.blocks_to_swap_out is not None
+                and worker_input.blocks_to_swap_out.numel() > 0):
+            self.cache_engine[virtual_engine].swap_out(
+                worker_input.blocks_to_swap_out)
+        if (worker_input.blocks_to_copy is not None
+                and worker_input.blocks_to_copy.numel() > 0):
+            self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy)
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.model_runner.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.model_runner.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.model_runner.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.model_runner.list_loras()
+
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def shutdown_inc(self):
+        self.model_runner.shutdown_inc()
+
+    @property
+    def max_model_len(self) -> int:
+        return self.model_config.max_model_len
+
+    @property
+    def vocab_size(self) -> int:
+        return self.model_runner.vocab_size
+
+    def get_cache_block_size_bytes(self) -> int:
+        """Get the size of a KV cache block in bytes.
+        """
+        return HPUCacheEngine.get_cache_block_size(self.cache_config,
+                                                   self.model_config,
+                                                   self.parallel_config)
+
+
+def init_worker_distributed_environment(
+    parallel_config: ParallelConfig,
+    rank: int,
+    distributed_init_method: Optional[str] = None,
+    local_rank: int = -1,
+) -> None:
+    """Initialize the distributed environment."""
+    init_distributed_environment(parallel_config.world_size,
+                                 rank,
+                                 distributed_init_method,
+                                 local_rank,
+                                 backend='hccl')
+
+    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
+                                      parallel_config.pipeline_parallel_size)
+
+    if torch.distributed.is_initialized():
+        torch_world_size = torch.distributed.get_world_size()
+        if torch_world_size != parallel_config.world_size:
+            raise RuntimeError(
+                "torch.distributed is already initialized but the torch world "
+                "size does not match parallel_config.world_size "
+                f"({torch_world_size} vs. {parallel_config.world_size}).")
+    elif not distributed_init_method:
+        raise ValueError(
+            "distributed_init_method must be set if torch.distributed "
+            "is not already initialized")
+    else:
+        torch.distributed.init_process_group(
+            backend="hccl",
+            world_size=parallel_config.world_size,
+            rank=rank,
+            init_method=distributed_init_method,
+        )
+
+    # A small all_reduce for warmup & checking conformance.
+    dummy_tensor_hpu = torch.ones(1).to('hpu')
+    torch.distributed.all_reduce(dummy_tensor_hpu)
+    assert dummy_tensor_hpu.item() == parallel_config.world_size
+    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
+                                      parallel_config.pipeline_parallel_size)
+
+
+def raise_if_cache_size_invalid(num_gpu_blocks, block_size,
+                                max_model_len) -> None:
+    if num_gpu_blocks <= 0:
+        raise ValueError("No available memory for the cache blocks. "
+                         "Try increasing `gpu_memory_utilization` when "
+                         "initializing the engine.")
+    max_seq_len = block_size * num_gpu_blocks
+    if max_model_len > max_seq_len:
+        raise ValueError(
+            f"The model's max seq len ({max_model_len}) "
+            "is larger than the maximum number of tokens that can be "
+            f"stored in KV cache ({max_seq_len}). Try increasing "
+            "`gpu_memory_utilization` or decreasing `max_model_len` when "
+            "initializing the engine.")
+
+
+class HPUCacheEngine(CacheEngine):
+
+    def _allocate_kv_cache(
+        self,
+        num_blocks: int,
+        device: str,
+    ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
+        """Allocates KV cache on the specified device."""
+        kv_cache_shape = self.attn_backend.get_kv_cache_shape(
+            num_blocks, self.block_size, self.num_kv_heads, self.head_size)
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = []
+        for _ in range(self.num_attention_layers):
+            key_cache = torch.zeros(kv_cache_shape,
+                                    dtype=self.dtype,
+                                    device=device)
+            value_cache = torch.zeros(kv_cache_shape,
+                                      dtype=self.dtype,
+                                      device=device)
+            kv_layer = (key_cache, value_cache)
+            kv_cache.append(kv_layer)
+        return kv_cache

From 6a585a23d2e7960164c7bd9d767858d50ac54c47 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 6 Nov 2024 01:24:28 -0800
Subject: [PATCH 84/85] [Hotfix] Fix ruff errors (#10073)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 setup.py                          | 3 +--
 vllm/executor/ray_hpu_executor.py | 2 +-
 vllm/worker/hpu_model_runner.py   | 7 +++----
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index 51ca5e2abecf7..4a20e49235ac8 100644
--- a/setup.py
+++ b/setup.py
@@ -382,8 +382,7 @@ def get_gaudi_sw_version():
     output = subprocess.run("hl-smi",
                             shell=True,
                             text=True,
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.PIPE,
+                            capture_output=True,
                             env={"ENABLE_CONSOLE": "true"})
     if output.returncode == 0 and output.stdout:
         return output.stdout.split("\n")[2].replace(
diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py
index 28d1882cb0db7..a24bab6df370e 100644
--- a/vllm/executor/ray_hpu_executor.py
+++ b/vllm/executor/ray_hpu_executor.py
@@ -34,7 +34,7 @@ class RayHPUExecutor(DistributedGPUExecutor):
     uses_ray: bool = True
 
     def _init_executor(self) -> None:
-        self.forward_dag: Optional["ray.dag.CompiledDAG"] = None
+        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
         # If the env var is set, it uses the Ray's compiled DAG API
         # which optimizes the control plane overhead.
         # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 5008a2abd22ea..7e9b2bd13b48a 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -67,8 +67,7 @@ class Singleton(type):
 
     def __call__(cls, *args, **kwargs):
         if cls not in cls._instances:
-            cls._instances[cls] = super(Singleton,
-                                        cls).__call__(*args, **kwargs)
+            cls._instances[cls] = super().__call__(*args, **kwargs)
         return cls._instances[cls]
 
 
@@ -273,7 +272,7 @@ def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt):
     return indices, offsets
 
 
-class HpuModelAdapter():
+class HpuModelAdapter:
 
     def __init__(self, model, block_size, dtype, enforce_eager):
         self.model = model
@@ -1643,7 +1642,7 @@ def _maybe_wrap_in_hpu_graph(*args, **kwargs):
     ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs)
 
 
-class HabanaProfilerCounterHelper():
+class HabanaProfilerCounterHelper:
 
     def __init__(self):
         self.niter = 0

From 2003cc35135319b240230e686f26f13524403ee0 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Wed, 6 Nov 2024 17:49:19 +0800
Subject: [PATCH 85/85] [Model][LoRA]LoRA support added for LlamaEmbeddingModel
 (#10071)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 docs/source/models/supported_models.rst |  2 +-
 vllm/model_executor/models/llama.py     | 20 +++++++++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 55835d945b00c..87f45cf695c8d 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -333,7 +333,7 @@ Text Embedding
   * - :code:`MistralModel`
     - Mistral-based
     - :code:`intfloat/e5-mistral-7b-instruct`, etc.
-    - 
+    - ✅︎
     - ✅︎
 
 .. important::
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 6c0a8b5ef8451..d768a57b7ef8a 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -627,7 +627,7 @@ def permute(w: torch.Tensor, n_heads: int):
         return name, loaded_weight
 
 
-class LlamaEmbeddingModel(nn.Module, SupportsPP):
+class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
     """
     A model that uses Llama with additional embedding functionalities.
 
@@ -638,6 +638,19 @@ class LlamaEmbeddingModel(nn.Module, SupportsPP):
         model: An instance of LlamaModel used for forward operations.
         _pooler: An instance of Pooler used for pooling operations.
     """
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens"
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+    }
+    embedding_padding_modules = []
 
     def __init__(
         self,
@@ -679,3 +692,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
 
     def load_kv_cache_scales(self, quantization_param_path: str) -> None:
         self.model.load_kv_cache_scales(quantization_param_path)
+
+    # LRUCacheWorkerLoRAManager instantiation requires model config.
+    @property
+    def config(self):
+        return self.model.config